summaryrefslogtreecommitdiff
path: root/src/tuning
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2018-01-15 21:32:51 +0100
committerCedric Nugteren <web@cedricnugteren.nl>2018-01-15 21:32:51 +0100
commit0e5eaa6eb9d3049da4d04ebd22d413883c036142 (patch)
treec338160211f31ab61ea67583fd189a369c920606 /src/tuning
parent6d52eb2956c294a46b43627c468813cf64f60b99 (diff)
Factored out the generic parts of the GEMM routine tuner
Diffstat (limited to 'src/tuning')
-rw-r--r--src/tuning/routines/routine_tuner.hpp122
-rw-r--r--src/tuning/routines/xgemm.cpp95
2 files changed, 133 insertions, 84 deletions
diff --git a/src/tuning/routines/routine_tuner.hpp b/src/tuning/routines/routine_tuner.hpp
new file mode 100644
index 00000000..c02c1b84
--- /dev/null
+++ b/src/tuning/routines/routine_tuner.hpp
@@ -0,0 +1,122 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the part of the auto-tuner for tuning entire routines (i.e. switching
+// between direct and in-direct GEMM kernels)
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_
+#define CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_
+
+#include <exception>
+#include <string>
+#include <vector>
+#include <assert.h>
+
+#include "utilities/utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+template <typename T>
+void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device,
+ const std::string &name, const std::string& parameter_name) {
+ const auto override_status = OverrideParameters(device(), name + "Routine", PrecisionValue<T>(),
+ {{parameter_name, minimum_size}});
+ if (override_status != StatusCode::kSuccess) {
+ throw RuntimeError("OverrideParameters failed with status " + ToString(override_status));
+ }
+}
+
+template <typename T, typename F>
+std::vector<TuningResult> TuneKernelSelection(const Device& device, const Context& context,
+ Queue& queue, const Precision precision,
+ F const &routine, const size_t num_runs,
+ const std::string &name,
+ const std::string& parameter_name) {
+
+ // Values for m, n, and k
+ const auto from = size_t{64};
+ const auto to = size_t{2048};
+ const auto step = size_t{64};
+
+ // Buffers
+ auto buffers = std::vector<Buffer<T>>{
+ Buffer<T>(context, to * to),
+ Buffer<T>(context, to * to),
+ Buffer<T>(context, to * to)
+ };
+
+ // In-direct version
+ printf("\n* Testing the in-direct %s routine for m=n=k\n", name.c_str());
+ ForceSelectIndirectFrom<T>(0, device, name, parameter_name);
+ const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, routine);
+
+ // Direct version
+ printf("\n* Testing the direct %s routine for m=n=k\n", name.c_str());
+ ForceSelectIndirectFrom<T>(to + 1, device, name, parameter_name);
+ const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, routine);
+
+ // Determining final score and best kernel selection point
+ assert(indirect.size() == direct.size());
+ printf("\n* Collecting results\n");
+ auto ratios = std::vector<double>(indirect.size());
+ for (auto i = size_t{0}; i < indirect.size(); ++i) {
+ ratios[i] = indirect[i].second / direct[i].second;
+ }
+ auto scores = std::vector<TuningResult>(ratios.size());
+ for (auto i = size_t{0}; i < scores.size(); ++i) {
+ auto score = 0;
+ for (auto j = size_t{0}; j < i; ++j) { score += (ratios[j] <= 1.0); }
+ for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
+ const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
+ const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1);
+ auto tuning_results = Configuration();
+ tuning_results[parameter_name] = indirect[i].first;
+ tuning_results["PRECISION"] = static_cast<size_t>(precision);
+ scores[i] = TuningResult{
+ name + "_kernel_selection",
+ (relative_score * relative_score) * 100 + epsilon, // squared for proper default computation
+ tuning_results
+ };
+ }
+
+ // Displaying results
+ printf("| || %8s indirect || %8s direct || |\n", name.c_str(), name.c_str());
+ printf("| m=n=k || ms | GFLOPS || ms | GFLOPS || score | (lowest score == best switching point)\n");
+ printf("x---------xx--------x----------xx--------x----------xx----------x\n");
+ for (auto i = size_t{0}; i < indirect.size(); ++i) {
+ assert(indirect[i].first == direct[i].first);
+ const auto value = indirect[i].first;
+ if (indirect[i].second != -1 && direct[i].second != -1) {
+ const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6);
+ const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6);
+ printf("| %7zu || %6.2lf | %8.1lf || %6.2lf | %8.1lf || %8.3lf |\n",
+ value, indirect[i].second, gflops_indirect, direct[i].second, gflops_direct, scores[i].score);
+ }
+ }
+ printf("x---------xx--------x----------xx--------x----------xx----------x\n");
+ printf("\n");
+ return scores;
+}
+
+// Computes the best switching point
+TuningResult GetBestResult(const std::vector<TuningResult>& scores) {
+ auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
+ const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison);
+ return *best_configuration;
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_
+#endif
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index 83db6104..de4bef71 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -15,10 +15,9 @@
#include <exception>
#include <string>
#include <vector>
-#include <assert.h>
#include "utilities/utilities.hpp"
-#include "tuning/tuning.hpp"
+#include "tuning/routines/routine_tuner.hpp"
namespace clblast {
// =================================================================================================
@@ -40,14 +39,7 @@ void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Bu
clReleaseEvent(event);
}
-template <typename T>
-void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device) {
- const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(),
- {{"XGEMM_MIN_INDIRECT_SIZE", minimum_size}});
- if (override_status != StatusCode::kSuccess) {
- throw RuntimeError("OverrideParameters failed with status " + ToString(override_status));
- }
-}
+// =================================================================================================
template <typename T>
void TuneXgemm(int argc, char* argv[]) {
@@ -59,11 +51,6 @@ void TuneXgemm(int argc, char* argv[]) {
const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
fprintf(stdout, "%s\n", help.c_str());
- // Values for m, n, and k
- const auto from = size_t{64};
- const auto to = size_t{2048};
- const auto step = size_t{64};
-
// OpenCL initialisation
const auto platform = Platform(platform_id);
const auto device = Device(platform, device_id);
@@ -74,81 +61,21 @@ void TuneXgemm(int argc, char* argv[]) {
const auto context = Context(device);
auto queue = Queue(context, device);
- // Buffers
- auto buffers = std::vector<Buffer<T>>{
- Buffer<T>(context, to * to),
- Buffer<T>(context, to * to),
- Buffer<T>(context, to * to)
- };
-
- // In-direct version
- printf("\n* Testing the in-direct GEMM routine for m=n=k\n");
- ForceSelectIndirectFrom<T>(0, device);
- const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
-
- // Direct version
- printf("\n* Testing the direct GEMM routine for m=n=k\n");
- ForceSelectIndirectFrom<T>(to + 1, device);
- const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
-
- // Determining final score and best kernel selection point
- assert(indirect.size() == direct.size());
- printf("\n* Collecting results\n");
- auto ratios = std::vector<double>(indirect.size());
- for (auto i = size_t{0}; i < indirect.size(); ++i) {
- ratios[i] = indirect[i].second / direct[i].second;
- }
- auto scores = std::vector<TuningResult>(ratios.size());
- for (auto i = size_t{0}; i < scores.size(); ++i) {
- auto score = 0;
- for (auto j = size_t{0}; j < i; ++j) { score += (ratios[j] <= 1.0); }
- for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
- const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
- const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1);
- auto tuning_results = Configuration();
- tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = indirect[i].first;
- tuning_results["PRECISION"] = static_cast<size_t>(precision);
- scores[i] = TuningResult{
- "gemm_kernel_selection",
- (relative_score * relative_score) * 100 + epsilon, // squared for proper default computation
- tuning_results
- };
- }
-
- // Displaying results
- printf("| || indirect GEMM || direct GEMM || |\n");
- printf("| m=n=k || ms | GFLOPS || ms | GFLOPS || score | (lowest score == best switching point)\n");
- printf("x---------xx--------x----------xx--------x----------xx----------x\n");
- for (auto i = size_t{0}; i < indirect.size(); ++i) {
- assert(indirect[i].first == direct[i].first);
- const auto value = indirect[i].first;
- if (indirect[i].second != -1 && direct[i].second != -1) {
- const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6);
- const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6);
- printf("| %7zu || %6.2lf | %8.1lf || %6.2lf | %8.1lf || %8.3lf |\n",
- value, indirect[i].second, gflops_indirect, direct[i].second, gflops_direct, scores[i].score);
- }
- }
- printf("x---------xx--------x----------xx--------x----------xx----------x\n");
- printf("\n");
-
- // Computes the best switching point
- auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
- const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison);
- const auto best_switching_point = best_configuration->config["XGEMM_MIN_INDIRECT_SIZE"];
- const auto best_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(best_switching_point);
+ // Run the tuners for the XGEMM routine
+ const auto scores = TuneKernelSelection<T>(device, context, queue, precision, RunGemmRoutine<T>,
+ num_runs, "Gemm", "XGEMM_MIN_INDIRECT_SIZE");
+ const auto xgemm_best = GetBestResult(scores);
+ const auto xgemm_switching_point = xgemm_best.config.at("XGEMM_MIN_INDIRECT_SIZE");
+ const auto xgemm_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(xgemm_switching_point);
// Outputs the results as JSON to disk, including some meta-data
const auto precision_string = std::to_string(static_cast<size_t>(precision));
auto metadata = std::vector<std::pair<std::string,std::string>>{
{"kernel_family", "gemm_routine"},
{"precision", precision_string},
- {"arg_from", ToString(from)},
- {"arg_to", ToString(to)},
- {"arg_step", ToString(step)},
- {"best_kernel", best_configuration->name},
- {"best_time", ToString(best_configuration->score)},
- {"best_parameters", best_string}
+ {"best_kernel", xgemm_best.name},
+ {"best_time", ToString(xgemm_best.score)},
+ {"best_parameters", xgemm_string}
};
PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json",
device, platform, metadata, scores);