Merge pull request #244 from CNugteren/kernel_selection_batched_gemm

Kernel selection for batched GEMM
author: Cedric Nugteren <web@cedricnugteren.nl> 2018-01-20 10:19:28 +0100
committer: GitHub <noreply@github.com> 2018-01-20 10:19:28 +0100
commit: b2c946c517736b3e33a5a651a907c7fb10d646f6 (patch)
tree: 8253935a3b251cda5ffaf6b1d7e78dde13610cc5
parent: b35e3d1e5326cdc257daa170eb243800616cfc26 (diff)
parent: c3f9371d16a66fa28906a3be9925a646e72ea471 (diff)
5 files changed, 189 insertions, 93 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 18254658..a6aae958 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -388,6 +388,7 @@ if(TUNERS)
       src/utilities/utilities.hpp
       src/tuning/configurations.hpp
       src/tuning/tuning.hpp
+      src/tuning/routines/routine_tuner.hpp
       src/kernel_preprocessor.hpp)
   set(TUNERS_COMMON ${TUNERS_COMMON} ${TUNERS_HEADERS})
 
diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp
index 1c0953e8..6a089b8a 100644
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@@ -96,7 +96,7 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans
   }
 
   // Selects which version of the batched GEMM to run
-  const auto do_gemm_direct = true;
+  const auto do_gemm_direct = Xgemm<T>::UseDirectKernel(m, n, k, db_["XGEMM_MIN_INDIRECT_SIZE"]);
   if (do_gemm_direct) { // single generic kernel
     BatchedGemmDirect(m, n, k, alphas_device,
                       a_buffer, a_offsets_int, a_ld, b_buffer, b_offsets_int, b_ld,
diff --git a/src/routines/levelx/xgemmstridedbatched.cpp b/src/routines/levelx/xgemmstridedbatched.cpp
index affbceee..6165a396 100644
--- a/src/routines/levelx/xgemmstridedbatched.cpp
+++ b/src/routines/levelx/xgemmstridedbatched.cpp
@@ -76,7 +76,7 @@ void XgemmStridedBatched<T>::DoGemmStridedBatched(const Layout layout, const Tra
   }
 
   // Selects which version of the batched GEMM to run
-  const auto do_gemm_direct = true;
+  const auto do_gemm_direct = Xgemm<T>::UseDirectKernel(m, n, k, db_["XGEMM_MIN_INDIRECT_SIZE"]);
   if (do_gemm_direct) { // single generic kernel
     BatchedGemmDirect(m, n, k, alpha,
                       a_buffer, a_offset, a_ld, a_stride,
diff --git a/src/tuning/routines/routine_tuner.hpp b/src/tuning/routines/routine_tuner.hpp
new file mode 100644
index 00000000..2aa0b3ce
--- /dev/null
+++ b/src/tuning/routines/routine_tuner.hpp
@@ -0,0 +1,136 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the part of the auto-tuner for tuning entire routines (i.e. switching
+// between direct and in-direct GEMM kernels)
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_
+#define CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_
+
+#include <exception>
+#include <string>
+#include <vector>
+#include <assert.h>
+
+#include "utilities/utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Overrides 'parameter_name' of 'tuner_name' with 'minimum_size'; throws if the override fails
+template <typename T>
+void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device,
+                             const std::string &tuner_name, const std::string& parameter_name) {
+  const auto status = OverrideParameters(device(), tuner_name, PrecisionValue<T>(),
+                                         {{parameter_name, minimum_size}});
+  if (status == StatusCode::kSuccess) { return; }
+  throw RuntimeError("OverrideParameters failed with status " + ToString(status));
+}
+
+// Returns the best (lowest-score) switching point; 'inline' as it is defined in a header
+inline TuningResult GetBestResult(const std::vector<TuningResult>& scores) {
+  if (scores.empty()) { throw RuntimeError("GetBestResult requires at least one tuning result"); }
+  const auto by_score = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
+  return *std::min_element(scores.begin(), scores.end(), by_score);
+}
+
+// Tunes the switching point between the direct and in-direct version of a routine for m=n=k
+template <typename T, typename F>
+void TuneKernelSelection(const Platform& platform, const Device& device, const Context& context,
+                         Queue& queue, const Precision precision, F const &routine,
+                         const size_t from, const size_t to, const size_t step, const size_t batch_count,
+                         const size_t num_runs, const std::string &name, const std::string &tuner_name,
+                         const std::string &family_name, const std::string& parameter_name) {
+
+  // Buffers: three of them, each holding 'batch_count' times the largest tested size of 'to * to'
+  auto buffers = std::vector<Buffer<T>>{
+      Buffer<T>(context, to * to * batch_count),
+      Buffer<T>(context, to * to * batch_count),
+      Buffer<T>(context, to * to * batch_count)
+  };
+
+  // In-direct version: timed with the tuning parameter overridden to 0
+  printf("\n* Testing the in-direct %s routine for m=n=k\n", name.c_str());
+  ForceSelectIndirectFrom<T>(0, device, tuner_name, parameter_name);
+  const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, routine);
+
+  // Direct version: timed with the tuning parameter overridden to 'batch_count * to + 1'
+  printf("\n* Testing the direct %s routine for m=n=k\n", name.c_str());
+  ForceSelectIndirectFrom<T>(batch_count * to + 1, device, tuner_name, parameter_name);
+  const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, routine);
+
+  // Determining final score and best kernel selection point
+  assert(indirect.size() == direct.size());
+  printf("\n* Collecting results\n");
+  auto ratios = std::vector<double>(indirect.size());
+  for (auto i = size_t{0}; i < indirect.size(); ++i) {
+    ratios[i] = indirect[i].second / direct[i].second;
+  }
+  auto scores = std::vector<TuningResult>(ratios.size());
+  for (auto i = size_t{0}; i < scores.size(); ++i) {
+    auto score = 0; // counts sizes below 'i' where in-direct is not slower and sizes above where it is
+    for (auto j = size_t{0}; j < i; ++j) { score += (ratios[j] <= 1.0); }
+    for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
+    const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
+    const auto num_others = scores.size() - 1; // zero if only one size was timed: avoid 0/0 below
+    const auto relative_score = (num_others == 0) ? 0.0 :
+                                static_cast<double>(score) / static_cast<double>(num_others);
+    auto tuning_results = Configuration();
+    tuning_results[parameter_name] = indirect[i].first;
+    tuning_results["PRECISION"] = static_cast<size_t>(precision);
+    scores[i] = TuningResult{ // the relative score is squared for proper default computation
+        name + "_kernel_selection", (relative_score * relative_score) * 100 + epsilon, tuning_results
+    };
+  }
+
+  // Displaying results (rows for which either timing equals -1 are not printed)
+  printf("|         || %12s indirect || %12s direct ||          |\n", name.c_str(), name.c_str());
+  printf("|   m=n=k ||    ms    |   GFLOPS   ||    ms    |  GFLOPS  ||  score   | (lowest score == best switching point)\n");
+  printf("x---------xx----------x------------xx----------x----------xx----------x\n");
+  for (auto i = size_t{0}; i < indirect.size(); ++i) {
+    assert(indirect[i].first == direct[i].first);
+    const auto value = indirect[i].first;
+    if (indirect[i].second != -1 && direct[i].second != -1) { // each run performs 'batch_count' GEMMs
+      const auto gflops_indirect = (2 * value * value * value * batch_count) / (indirect[i].second * 1.0e6);
+      const auto gflops_direct = (2 * value * value * value * batch_count) / (direct[i].second * 1.0e6);
+      printf("| %7zu || %8.2lf | %10.1lf || %8.2lf | %8.1lf || %8.3lf |\n",
+             value, indirect[i].second, gflops_indirect, direct[i].second, gflops_direct, scores[i].score);
+    }
+  }
+  printf("x---------xx----------x------------xx----------x----------xx----------x\n");
+  printf("\n");
+
+  const auto best_result = GetBestResult(scores);
+  const auto best_switching_point = best_result.config.at(parameter_name);
+  const auto best_string = parameter_name + "=" + ToString(best_switching_point);
+
+  // Outputs the results as JSON to disk, including some meta-data
+  const auto precision_string = std::to_string(static_cast<size_t>(precision));
+  auto metadata = std::vector<std::pair<std::string,std::string>>{
+      {"kernel_family", family_name},
+      {"precision", precision_string},
+      {"arg_from", ToString(from)},
+      {"arg_to", ToString(to)},
+      {"arg_step", ToString(step)},
+      {"best_kernel", best_result.name},
+      {"best_time", ToString(best_result.score)},
+      {"best_parameters", best_string}
+  };
+  PrintTimingsToFileAsJSON("clblast_" + family_name + "_" + precision_string + ".json",
+                           device, platform, metadata, scores);
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_
+#endif
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index 83db6104..0721ad7c 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -15,10 +15,9 @@
 #include <exception>
 #include <string>
 #include <vector>
-#include <assert.h>
 
 #include "utilities/utilities.hpp"
-#include "tuning/tuning.hpp"
+#include "tuning/routines/routine_tuner.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -40,15 +39,48 @@ void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Bu
   clReleaseEvent(event);
 }
 
-template <typename T>
-void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device) {
-  const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(),
-                                                  {{"XGEMM_MIN_INDIRECT_SIZE", minimum_size}});
-  if (override_status != StatusCode::kSuccess) {
-    throw RuntimeError("OverrideParameters failed with status " + ToString(override_status));
+template <typename T, size_t batch_count> // runs one batched GEMM with m=n=k='value' and waits for it
+void RunGemmBatchedRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
+  auto offsets = std::vector<size_t>(batch_count);
+  auto factors = std::vector<T>(batch_count);
+  for (auto i = size_t{0}; i < batch_count; ++i) {
+    offsets[i] = i * value * value; // a distinct value-by-value matrix for each batch
+    factors[i] = ConstantOne<T>();
   }
+  auto queue_plain = queue();
+  auto event = cl_event{};
+  auto status = GemmBatched(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
+                            value, value, value, factors.data(),
+                            buffers[0](), offsets.data(), value,
+                            buffers[1](), offsets.data(), value, factors.data(),
+                            buffers[2](), offsets.data(), value, batch_count,
+                            &queue_plain, &event);
+  if (status != StatusCode::kSuccess) {
+    throw RuntimeError("GemmBatched failed with status " + ToString(status));
+  }
+  clWaitForEvents(1, &event);
+  clReleaseEvent(event);
 }
 
+template <typename T, size_t batch_count> // runs one strided-batched GEMM with m=n=k='value'
+void RunGemmStridedBatchedRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
+  auto queue_plain = queue();
+  auto event = cl_event{};
+  auto status = GemmStridedBatched(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
+                                   value, value, value, ConstantOne<T>(),
+                                   buffers[0](), 0, value, value * value,
+                                   buffers[1](), 0, value, value * value, ConstantOne<T>(),
+                                   buffers[2](), 0, value, value * value, batch_count,
+                                   &queue_plain, &event);
+  if (status != StatusCode::kSuccess) { // report the routine that was actually called
+    throw RuntimeError("GemmStridedBatched failed with status " + ToString(status));
+  }
+  clWaitForEvents(1, &event); // blocks until the routine has completed
+  clReleaseEvent(event);
+}
+
+// =================================================================================================
+
 template <typename T>
 void TuneXgemm(int argc, char* argv[]) {
   auto command_line_args = RetrieveCommandLineArguments(argc, argv);
@@ -59,11 +91,6 @@ void TuneXgemm(int argc, char* argv[]) {
   const auto num_runs    = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
   fprintf(stdout, "%s\n", help.c_str());
 
-  // Values for m, n, and k
-  const auto from = size_t{64};
-  const auto to = size_t{2048};
-  const auto step = size_t{64};
-
   // OpenCL initialisation
   const auto platform = Platform(platform_id);
   const auto device = Device(platform, device_id);
@@ -74,84 +101,16 @@ void TuneXgemm(int argc, char* argv[]) {
   const auto context = Context(device);
   auto queue = Queue(context, device);
 
-  // Buffers
-  auto buffers = std::vector<Buffer<T>>{
-      Buffer<T>(context, to * to),
-      Buffer<T>(context, to * to),
-      Buffer<T>(context, to * to)
-  };
-
-  // In-direct version
-  printf("\n* Testing the in-direct GEMM routine for m=n=k\n");
-  ForceSelectIndirectFrom<T>(0, device);
-  const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
-
-  // Direct version
-  printf("\n* Testing the direct GEMM routine for m=n=k\n");
-  ForceSelectIndirectFrom<T>(to + 1, device);
-  const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
-
-  // Determining final score and best kernel selection point
-  assert(indirect.size() == direct.size());
-  printf("\n* Collecting results\n");
-  auto ratios = std::vector<double>(indirect.size());
-  for (auto i = size_t{0}; i < indirect.size(); ++i) {
-    ratios[i] = indirect[i].second / direct[i].second;
-  }
-  auto scores = std::vector<TuningResult>(ratios.size());
-  for (auto i = size_t{0}; i < scores.size(); ++i) {
-    auto score = 0;
-    for (auto j = size_t{0}; j < i; ++j) { score += (ratios[j] <= 1.0); }
-    for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
-    const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
-    const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1);
-    auto tuning_results = Configuration();
-    tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = indirect[i].first;
-    tuning_results["PRECISION"] = static_cast<size_t>(precision);
-    scores[i] = TuningResult{
-        "gemm_kernel_selection",
-        (relative_score * relative_score) * 100 + epsilon,  // squared for proper default computation
-        tuning_results
-    };
-  }
-
-  // Displaying results
-  printf("|         ||   indirect GEMM   ||    direct GEMM    ||          |\n");
-  printf("|   m=n=k ||   ms   |  GFLOPS  ||   ms   |  GFLOPS  ||  score   | (lowest score == best switching point)\n");
-  printf("x---------xx--------x----------xx--------x----------xx----------x\n");
-  for (auto i = size_t{0}; i < indirect.size(); ++i) {
-    assert(indirect[i].first == direct[i].first);
-    const auto value = indirect[i].first;
-    if (indirect[i].second != -1 && direct[i].second != -1) {
-      const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6);
-      const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6);
-      printf("| %7zu || %6.2lf | %8.1lf || %6.2lf | %8.1lf || %8.3lf |\n",
-             value, indirect[i].second, gflops_indirect, direct[i].second, gflops_direct, scores[i].score);
-    }
-  }
-  printf("x---------xx--------x----------xx--------x----------xx----------x\n");
-  printf("\n");
-
-  // Computes the best switching point
-  auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
-  const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison);
-  const auto best_switching_point = best_configuration->config["XGEMM_MIN_INDIRECT_SIZE"];
-  const auto best_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(best_switching_point);
-
-  // Outputs the results as JSON to disk, including some meta-data
-  const auto precision_string = std::to_string(static_cast<size_t>(precision));
-  auto metadata = std::vector<std::pair<std::string,std::string>>{
-      {"kernel_family", "gemm_routine"},
-      {"precision", precision_string},
-      {"arg_from", ToString(from)},
-      {"arg_to", ToString(to)},
-      {"arg_step", ToString(step)},
-      {"best_kernel", best_configuration->name},
-      {"best_time", ToString(best_configuration->score)},
-      {"best_parameters", best_string}
-  };
-  PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json",
-                           device, platform, metadata, scores);
+  // Run the tuners for the XGEMM routines
+  TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
+                         64, 2048, 64, 1, num_runs,
+                         "gemm", "GemmRoutine", "gemm_routine", "XGEMM_MIN_INDIRECT_SIZE");
+  //TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmBatchedRoutine<T, 30>,
+  //                       16, 128, 32, 30, num_runs,
+  //                       "gemmbatched", "GemmRoutine", "gemm_routine_2", "XGEMMBATCHED_MIN_INDIRECT_SIZE");
+  //TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmStridedBatchedRoutine<T, 30>,
+  //                       16, 128, 32, 30, num_runs,
+  //                       "gemmstridedbatched", "GemmRoutine", "gemm_routine_3", "XGEMMSTRIDEDBATCHED_MIN_INDIRECT_SIZE");
 
   printf("* Completed tuning process\n");
   printf("\n");
author	Cedric Nugteren <web@cedricnugteren.nl>	2018-01-20 10:19:28 +0100
committer	GitHub <noreply@github.com>	2018-01-20 10:19:28 +0100
commit	b2c946c517736b3e33a5a651a907c7fb10d646f6 (patch)
tree	8253935a3b251cda5ffaf6b1d7e78dde13610cc5
parent	b35e3d1e5326cdc257daa170eb243800616cfc26 (diff)
parent	c3f9371d16a66fa28906a3be9925a646e72ea471 (diff)