// Source path: root/src/tuning/routines/routine_tuner.hpp
// (blob 2aa0b3ce0e1f69d714dc10a7bcf65aba72d0ea31)
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the part of the auto-tuner for tuning entire routines (i.e. switching
// between direct and in-direct GEMM kernels)
//
// =================================================================================================

#ifndef CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_
#define CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_

#include <algorithm>
#include <cstdio>
#include <exception>
#include <string>
#include <vector>
#include <assert.h>

#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"

namespace clblast {
// =================================================================================================

// Overrides a single tuning parameter: sets 'parameter_name' to 'minimum_size' for the tuner
// 'tuner_name' on the given device, using the precision that corresponds to type T. Throws a
// RuntimeError in case the override is not reported as successful.
// NOTE(review): judging by the name, the parameter is presumably the size from which the in-direct
// kernel gets selected - confirm against the routine that consumes it.
template <typename T>
void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device,
                             const std::string &tuner_name, const std::string& parameter_name) {
  const auto status = OverrideParameters(device(), tuner_name, PrecisionValue<T>(),
                                         {{parameter_name, minimum_size}});

  // Nothing more to do when the override went through
  if (status == StatusCode::kSuccess) { return; }

  // Otherwise reports the failing status code to the caller
  throw RuntimeError("OverrideParameters failed with status " + ToString(status));
}

// Computes the best switching point: returns a copy of the entry in 'scores' with the lowest
// 'score' field (the first such entry in case of a tie). The input must contain at least one
// entry. This non-template function is defined in a header, so it is marked 'inline' to avoid
// multiple-definition errors when the header is included from more than one translation unit.
inline TuningResult GetBestResult(const std::vector<TuningResult>& scores) {

  // An empty range makes min_element return the end iterator, which must not be dereferenced
  assert(!scores.empty());

  const auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) {
    return lhs.score < rhs.score;
  };
  const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison);
  return *best_configuration;
}

// Tunes at kernel-level: times 'routine' twice over the range given by 'from', 'to' and 'step'
// (once with the tuning parameter forced to 0, labelled 'in-direct', and once with it forced to
// 'batch_count * to + 1', labelled 'direct'), scores every measured point as a candidate switching
// point, prints a table and writes the scores plus some meta-data to a JSON file on disk.
//
//   platform, device: passed on to the JSON writer; 'device' is also used to override the parameter
//   context, queue:   used to create the three buffers and to time the routine
//   precision:        stored as "PRECISION" in each result and used in the JSON file-name
//   routine:          the callable handed to TimeRoutine
//   from, to, step:   forwarded to TimeRoutine; 'to' also determines the buffer sizes
//   batch_count:      scales the buffer sizes and the 'direct' override value
//   num_runs:         forwarded to TimeRoutine
//   name:             routine name used in the printed output and in the result names
//   tuner_name:       tuner for which the parameter is overridden
//   family_name:      used for the "kernel_family" meta-data and the JSON file-name
//   parameter_name:   name of the tuning parameter holding the switching point
//
// Throws whatever ForceSelectIndirectFrom throws when overriding the parameter fails. In case no
// measurement points were produced at all, a message is printed and nothing is written to disk.
template <typename T, typename F>
void TuneKernelSelection(const Platform& platform, const Device& device, const Context& context,
                         Queue& queue, const Precision precision, F const &routine,
                         const size_t from, const size_t to, const size_t step, const size_t batch_count,
                         const size_t num_runs, const std::string &name, const std::string &tuner_name,
                         const std::string &family_name, const std::string& parameter_name) {

  // Buffers: three of them, each holding 'to * to' elements per batch
  auto buffers = std::vector<Buffer<T>>{
      Buffer<T>(context, to * to * batch_count),
      Buffer<T>(context, to * to * batch_count),
      Buffer<T>(context, to * to * batch_count)
  };

  // In-direct version: the parameter is forced to zero
  printf("\n* Testing the in-direct %s routine for m=n=k\n", name.c_str());
  ForceSelectIndirectFrom<T>(0, device, tuner_name, parameter_name);
  const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, routine);

  // Direct version: the parameter is forced to a value derived from the largest size
  printf("\n* Testing the direct %s routine for m=n=k\n", name.c_str());
  ForceSelectIndirectFrom<T>(batch_count * to + 1, device, tuner_name, parameter_name);
  const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, routine);

  // Both runs cover the same range, so they have to produce the same number of points
  assert(indirect.size() == direct.size());

  // Without any measurement there is no best result to select: stops here instead of looking for
  // the minimum of an empty list of scores
  if (indirect.size() == 0) {
    printf("\n* No measurements were collected, skipping the kernel selection\n");
    return;
  }

  // Determining final score and best kernel selection point. The ratio is the in-direct time
  // divided by the direct time, i.e. a value above one means that the direct version was faster.
  // NOTE(review): the display loop below skips timings equal to -1, but such entries are not
  // filtered out here and thus still take part in the ratios - confirm whether that is intended.
  printf("\n* Collecting results\n");
  auto ratios = std::vector<double>(indirect.size());
  for (auto i = size_t{0}; i < indirect.size(); ++i) {
    ratios[i] = indirect[i].second / direct[i].second;
  }
  auto scores = std::vector<TuningResult>(ratios.size());

  // Normalisation factor for the scores. With a single measurement point there are no other points
  // to compare against: uses one in that case to avoid computing 0/0 (which would yield NaN).
  const auto num_other_points = (scores.size() > 1) ? static_cast<double>(scores.size() - 1) : 1.0;

  for (auto i = size_t{0}; i < scores.size(); ++i) {

    // Counts the points that disagree with switching at point 'i': those before 'i' with a ratio
    // of at most one, and those after 'i' with a ratio above one
    auto score = 0;
    for (auto j = size_t{0}; j < i; ++j) { score += (ratios[j] <= 1.0); }
    for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
    const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
    const auto relative_score = static_cast<double>(score) / num_other_points;
    auto tuning_results = Configuration();
    tuning_results[parameter_name] = indirect[i].first;
    tuning_results["PRECISION"] = static_cast<size_t>(precision);
    scores[i] = TuningResult{
        name + "_kernel_selection",
        (relative_score * relative_score) * 100 + epsilon,  // squared for proper default computation
        tuning_results
    };
  }

  // Displaying results: one row per size for which both versions have a timing other than -1
  printf("|         || %12s indirect || %12s direct ||          |\n", name.c_str(), name.c_str());
  printf("|   m=n=k ||    ms    |   GFLOPS   ||    ms    |  GFLOPS  ||  score   | (lowest score == best switching point)\n");
  printf("x---------xx----------x------------xx----------x----------xx----------x\n");
  for (auto i = size_t{0}; i < indirect.size(); ++i) {
    assert(indirect[i].first == direct[i].first);
    const auto value = indirect[i].first;
    if (indirect[i].second != -1 && direct[i].second != -1) {

      // 2 * m * n * k operations with m=n=k, divided by the time in milliseconds times 1e6
      const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6);
      const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6);
      printf("| %7zu || %8.2lf | %10.1lf || %8.2lf | %8.1lf || %8.3lf |\n",
             value, indirect[i].second, gflops_indirect, direct[i].second, gflops_direct, scores[i].score);
    }
  }
  printf("x---------xx----------x------------xx----------x----------xx----------x\n");
  printf("\n");

  // Selects the point with the lowest score as the best switching point
  const auto best_result = GetBestResult(scores);
  const auto best_switching_point = best_result.config.at(parameter_name);
  const auto best_string = parameter_name + "=" + ToString(best_switching_point);

  // Outputs the results as JSON to disk, including some meta-data
  const auto precision_string = std::to_string(static_cast<size_t>(precision));
  auto metadata = std::vector<std::pair<std::string,std::string>>{
      {"kernel_family", family_name},
      {"precision", precision_string},
      {"arg_from", ToString(from)},
      {"arg_to", ToString(to)},
      {"arg_step", ToString(step)},
      {"best_kernel", best_result.name},
      {"best_time", ToString(best_result.score)},
      {"best_parameters", best_string}
  };
  PrintTimingsToFileAsJSON("clblast_" + family_name + "_" + precision_string + ".json",
                           device, platform, metadata, scores);
}

// =================================================================================================
} // namespace clblast

// CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_
#endif