From bd57dfa435dd6c161b758aef2c68404f837ed689 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 28 Oct 2017 14:12:05 +0200
Subject: Moved timing function to a separate file

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

(limited to 'CMakeLists.txt')
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 610e5149..d3b202c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -242,6 +242,7 @@ set(HEADERS  # such that they can be discovered by IDEs such as CLion and Visual
   src/utilities/clblast_exceptions.hpp
   src/utilities/device_mapping.hpp
   src/utilities/msvc.hpp
+  src/utilities/timing.hpp
   src/utilities/utilities.hpp
   src/cache.hpp
   src/cxpp11_common.hpp
-- 
cgit v1.2.3


From 334a26eb12a10b597312db2a1b7de1548cba6327 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 28 Oct 2017 17:30:29 +0200
Subject: Added initial version of a GEMM kernel selection tuner

---
 CMakeLists.txt                |   7 +++
 src/tuning/routines/xgemm.cpp | 136 ++++++++++++++++++++++++++++++++++++++++++
 src/utilities/timing.hpp      |  52 ++++++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 src/tuning/routines/xgemm.cpp

(limited to 'CMakeLists.txt')

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d3b202c2..73b47637 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -377,6 +377,13 @@ if(TUNERS)
     target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
+  set(ROUTINE_TUNERS xgemm)
+  foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
+    add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
+    target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
+    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC ${CLTUNE_INCLUDE_DIRS})
+    install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
+  endforeach()
 
   # Adds 'alltuners' target: runs all tuners for all precisions
   set(ALLTUNERS )
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
new file mode 100644
index 00000000..9590323a
--- /dev/null
+++ b/src/tuning/routines/xgemm.cpp
@@ -0,0 +1,136 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file tunes the Xgemm routine at a high-level: choosing between the direct (single-kernel)
+// and the in-direct (kernel plus pre/post-processing) methods.
+//
+// =================================================================================================
+
+#include <exception>
+#include <string>
+#include <vector>
+#include <assert.h>
+
+#include "utilities/utilities.hpp"
+#include "utilities/timing.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Runs a single row-major, non-transposed GEMM with m = n = k = value on the matrices held in
+// buffers[0], buffers[1] and buffers[2] (zero offsets, leading dimension 'value', both scalars
+// ConstantOne<T>()) and waits for it to finish; throws a RuntimeError if Gemm reports a failure.
+template <typename T>
+void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
+  auto raw_queue = queue();
+  auto done_event = cl_event{};
+  const auto status = Gemm(Layout::kRowMajor, Transpose::kNo, Transpose::kNo, value, value, value,
+                           ConstantOne<T>(), buffers[0](), 0, value, buffers[1](), 0, value,
+                           ConstantOne<T>(), buffers[2](), 0, value, &raw_queue, &done_event);
+  if (status != StatusCode::kSuccess) {
+    throw RuntimeError("Gemm failed with status " + ToString(status));
+  }
+  clWaitForEvents(1, &done_event);
+  clReleaseEvent(done_event);
+}
+
+template <typename T>
+void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device) {
+  const auto override_status = OverrideParameters(device(), "KernelSelection", PrecisionValue<T>(),
+                                                  {{"XGEMM_MIN_INDIRECT_SIZE", minimum_size}});
+  if (override_status != StatusCode::kSuccess) {
+    throw RuntimeError("OverrideParameters failed with status " + ToString(override_status));
+  }
+}
+
+// Times the in-direct and the direct GEMM for m=n=k in [from, to) and prints the GFLOPS of both.
+template <typename T>
+void TuneXgemm(int argc, char* argv[]) {
+  auto command_line_args = RetrieveCommandLineArguments(argc, argv);
+  auto help = std::string{"* Options given/available:\n"};
+  const auto platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
+  const auto device_id   = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
+  const auto precision   = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
+  const auto num_runs    = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
+  fprintf(stdout, "%s\n", help.c_str());
+
+  // Values for m, n, and k: from 'from' up to (but excluding) 'to' in increments of 'step'
+  const auto from = size_t{64};
+  const auto to = size_t{1024};
+  const auto step = size_t{64};
+
+  // OpenCL initialisation
+  const auto platform = Platform(platform_id);
+  const auto device = Device(platform, device_id);
+  if (!PrecisionSupported<T>(device)) {
+    printf("* Unsupported precision, skipping this tuning run\n\n");
+    return;
+  }
+  const auto context = Context(device);
+  const auto queue = Queue(context, device);
+
+  // Buffers: each holds to * to elements, which is enough for every tested problem size
+  auto a_mat = Buffer<T>(context, to * to);
+  auto b_mat = Buffer<T>(context, to * to);
+  auto c_mat = Buffer<T>(context, to * to);
+  auto buffers = std::vector<Buffer<T>>{a_mat, b_mat, c_mat};
+
+  // In-direct version: a threshold of 0 is meant to select the in-direct kernel for every size
+  printf("[----------] Testing the in-direct GEMM routine for m=n=k\n");
+  ForceSelectIndirectFrom<T>(0, device);
+  const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
+
+  // Direct version: a threshold above to^3 is meant to select the direct kernel for every size
+  printf("[----------] Testing the direct GEMM routine for m=n=k\n");
+  ForceSelectIndirectFrom<T>(to * to * to + 1, device);
+  const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
+
+  // Results: a run without a positive time (e.g. -1.0 for a failed run) is shown as 0 GFLOPS
+  const auto Gflops = [](const size_t value, const double time_ms) {
+    return (time_ms > 0.0) ? (2 * value * value * value) / (time_ms * 1.0e6) : 0.0;
+  };
+  printf("[----------] Collecting results\n");
+  assert(indirect.size() == direct.size());
+  for (auto i = size_t{0}; i < indirect.size(); ++i) {
+    assert(indirect[i].first == direct[i].first);
+    const auto value = indirect[i].first;
+    printf("[ -------> ] %7zu %8.2lf %8.2lf\n", value,
+           Gflops(value, indirect[i].second), Gflops(value, direct[i].second));
+  }
+
+  // Writes meta-data about this run as JSON to disk (the timings themselves are not passed on)
+  const auto precision_string = std::to_string(static_cast<size_t>(precision));
+  const auto metadata = std::vector<std::pair<std::string,std::string>>{
+      {"kernel_family", "gemm_routine"}, {"precision", precision_string}};
+  PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json",
+                           device, platform, metadata);
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using half = clblast::half;
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): runs the tuner instantiated for the precision
+// requested on the command-line; a precision without a case below results in no tuning at all.
+int main(int argc, char *argv[]) {
+  switch (clblast::GetPrecision(clblast::RetrieveCommandLineArguments(argc, argv))) {
+    case clblast::Precision::kHalf: clblast::TuneXgemm<half>(argc, argv); break;
+    case clblast::Precision::kSingle: clblast::TuneXgemm<float>(argc, argv); break;
+    case clblast::Precision::kDouble: clblast::TuneXgemm<double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle: clblast::TuneXgemm<float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble: clblast::TuneXgemm<double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index 3d66de2a..4622aa99 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -14,14 +14,20 @@
 #ifndef CLBLAST_TIMING_H_
 #define CLBLAST_TIMING_H_
 
+#include <cstdio>
+#include <utility>
 #include <vector>
+#include <algorithm>
 #include <chrono>
 
+#include "utilities/utilities.hpp"
+
 namespace clblast {
 // =================================================================================================
 
 template <typename F>
 double TimeFunction(const size_t num_runs, F const &function) {
+  function(); // warm-up
   auto timings = std::vector<double>(num_runs);
   for (auto &timing: timings) {
     const auto start_time = std::chrono::steady_clock::now();
@@ -32,6 +38,52 @@ double TimeFunction(const size_t num_runs, F const &function) {
   return *std::min_element(timings.begin(), timings.end());
 }
 
+// =================================================================================================
+
+using Timing = std::pair<size_t, double>;
+
+// Times routine(value, queue, buffers) through TimeFunction for each value from 'from' up to (but
+// excluding) 'to' in increments of 'step'; a value whose run throws is recorded with a time of -1.
+template <typename T, typename F>
+std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t step,
+                                const size_t num_runs, const Queue& queue,
+                                const std::vector<Buffer<T>>& buffers, F const &routine) {
+  auto results = std::vector<Timing>();
+  for (auto value = from; value < to; value += step) {
+    printf("[ RUN      ] Running with value %zu\n", value);
+    try {
+      const auto best_ms = TimeFunction(num_runs, [&]() { routine(value, queue, buffers); });
+      printf("[       OK ] Took %.2lf ms\n", best_ms);
+      results.push_back({value, best_ms});
+    } catch (...) {
+      printf("[    ERROR ] Exception caught\n");
+      results.push_back({value, -1.0}); // invalid
+    }
+  }
+  return results;
+}
+
+// =================================================================================================
+
+// Writes 'descriptions' followed by platform/device properties to 'filename' as a JSON object.
+inline void PrintTimingsToFileAsJSON(const std::string &filename, const Device& device,
+    const Platform& platform, const std::vector<std::pair<std::string,std::string>> &descriptions) {
+  const auto file = fopen(filename.c_str(), "w");
+  if (file == nullptr) { fprintf(stderr, "* Could not write to '%s'\n", filename.c_str()); return; }
+  fprintf(file, "{\n");
+  for (auto &description: descriptions) {
+    fprintf(file, "  \"%s\": \"%s\",\n", description.first.c_str(), description.second.c_str());
+  }
+  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
+  fprintf(file, "  \"device_name\": \"%s\",\n", GetDeviceName(device).c_str());
+  fprintf(file, "  \"device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, "  \"device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, "  \"device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
+  fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
+  fprintf(file, "  \"device_compute_units\": \"%zu\"\n}\n", device.ComputeUnits()); // last: no comma
+  fclose(file);
+}
+
 // =================================================================================================
 } // namespace clblast
 
-- 
cgit v1.2.3


From 9b0a435fb00b845b875590be90acffcd4f3bb009 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Thu, 2 Nov 2017 21:47:14 +0100
Subject: Integrated the GEMM routine tuner for kernel selection; added first
 tuning results

---
 CHANGELOG                                          |   1 +
 CMakeLists.txt                                     |   6 +-
 README.md                                          |   2 +
 scripts/database/database/clblast.py               |   3 +-
 src/database/database.cpp                          |   5 +-
 src/database/kernel_selection.hpp                  | 136 ---------------------
 src/database/kernels/gemm_routine/gemm_routine.hpp |  14 +++
 .../kernels/gemm_routine/gemm_routine_16.hpp       |  26 ++++
 .../kernels/gemm_routine/gemm_routine_32.hpp       |  34 ++++++
 .../kernels/gemm_routine/gemm_routine_3232.hpp     |  34 ++++++
 .../kernels/gemm_routine/gemm_routine_64.hpp       |  26 ++++
 .../kernels/gemm_routine/gemm_routine_6464.hpp     |  26 ++++
 src/routine.cpp                                    |   2 +-
 src/routines/level3/xgemm.cpp                      |   6 +-
 src/routines/levelx/xgemmbatched.cpp               |   2 +-
 src/tuning/routines/xgemm.cpp                      |  13 +-
 src/utilities/timing.hpp                           |   9 +-
 test/routines/level3/xgemm.hpp                     |   2 +-
 18 files changed, 193 insertions(+), 154 deletions(-)
 delete mode 100644 src/database/kernel_selection.hpp
 create mode 100644 src/database/kernels/gemm_routine/gemm_routine.hpp
 create mode 100644 src/database/kernels/gemm_routine/gemm_routine_16.hpp
 create mode 100644 src/database/kernels/gemm_routine/gemm_routine_32.hpp
 create mode 100644 src/database/kernels/gemm_routine/gemm_routine_3232.hpp
 create mode 100644 src/database/kernels/gemm_routine/gemm_routine_64.hpp
 create mode 100644 src/database/kernels/gemm_routine/gemm_routine_6464.hpp

(limited to 'CMakeLists.txt')

diff --git a/CHANGELOG b/CHANGELOG
index 14a6dd22..c565559f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -8,6 +8,7 @@ Development (next version)
   * All correctness tests and performance clients work on CUDA like they did for OpenCL
 - Kernels are now cached based on their tuning parameters: fits the use-case of 'OverrideParameters'
 - Improved performance for small GEMM problems by going from 3 to 1 optional temporary buffers
+- GEMM kernel selection (direct vs in-direct) is now done automatically using a new tuner
 - Various minor fixes and enhancements
 - Added tuned parameters for various devices (see README)
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73b47637..a982d87d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -192,7 +192,9 @@ endif()
 set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
             xgemm xgemm_direct xgemv)
 set(DATABASES copy pad padtranspose transpose xaxpy xdot
-              xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger)
+              xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger
+              gemm_routine)
+set(ROUTINE_TUNERS xgemm)
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
                     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
@@ -231,7 +233,6 @@ set(HEADERS  # such that they can be discovered by IDEs such as CLion and Visual
   src/database/apple_cpu_fallback.hpp
   src/database/database.hpp
   src/database/database_structure.hpp
-  src/database/kernel_selection.hpp
   src/routines/level1/xamin.hpp
   src/routines/level1/xmax.hpp
   src/routines/level1/xmin.hpp
@@ -377,7 +378,6 @@ if(TUNERS)
     target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
-  set(ROUTINE_TUNERS xgemm)
   foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
     add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
     target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
diff --git a/README.md b/README.md
index 0232c3f3..3070cc9c 100644
--- a/README.md
+++ b/README.md
@@ -196,6 +196,8 @@ In summary, tuning the entire library for your device can be done as follows (st
 
 Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). To do so, you can call the `OverrideParameters` function which will set new parameters for a specific kernel. At the first next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on. Until `OverrideParameters` is called again of course. See the [API documentation](doc/clblast.md#overrideparameters-override-tuning-parameters-auxiliary-function) for more details.
 
+After the kernels are tuned, you can run the `clblast_tuner_routine_xgemm` tuner to optimize the high-level GEMM routine, i.e. selecting which method to use: the direct kernel or the in-direct kernel.
+
 
 Compiling the correctness tests (optional)
 -------------
diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py
index 428bfdda..2b4f734c 100644
--- a/scripts/database/database/clblast.py
+++ b/scripts/database/database/clblast.py
@@ -23,7 +23,8 @@ DEVICE_TYPE_ATTRIBUTES = ["clblast_device_vendor", "clblast_device_type"]
 DEVICE_ATTRIBUTES = ["clblast_device_name", "clblast_device_architecture",
                      "device_core_clock", "device_compute_units"]
 KERNEL_ATTRIBUTES = ["precision", "kernel_family"]
-ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
+ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta",
+                       "arg_from", "arg_to", "arg_step"]
 ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES
 GROUP_ATTRIBUTES = DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ["kernel"] + ARGUMENT_ATTRIBUTES
 
diff --git a/src/database/database.cpp b/src/database/database.cpp
index 836c8803..2fa86151 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -30,10 +30,11 @@
 #include "database/kernels/transpose/transpose.hpp"
 #include "database/kernels/padtranspose/padtranspose.hpp"
 
+#include "database/kernels/gemm_routine/gemm_routine.hpp"
+
 #include "database/kernels/xtrsv.hpp"
 #include "database/kernels/invert.hpp"
 #include "database/apple_cpu_fallback.hpp"
-#include "database/kernel_selection.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -54,7 +55,7 @@ const std::vector<database::DatabaseEntry> Database::database = std::vector<data
   database::TransposeHalf, database::TransposeSingle, database::TransposeDouble, database::TransposeComplexSingle, database::TransposeComplexDouble,
   database::PadtransposeHalf, database::PadtransposeSingle, database::PadtransposeDouble, database::PadtransposeComplexSingle, database::PadtransposeComplexDouble,
   database::InvertHalf, database::InvertSingle, database::InvertDouble, database::InvertComplexSingle, database::InvertComplexDouble,
-  database::KernelSelectionHalf, database::KernelSelectionSingle, database::KernelSelectionDouble, database::KernelSelectionComplexSingle, database::KernelSelectionComplexDouble
+  database::GemmRoutineHalf, database::GemmRoutineSingle, database::GemmRoutineDouble, database::GemmRoutineComplexSingle, database::GemmRoutineComplexDouble
 };
 const std::vector<database::DatabaseEntry> Database::apple_cpu_fallback = std::vector<database::DatabaseEntry>{
   database::XaxpyApple, database::XdotApple,
diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp
deleted file mode 100644
index 6d74b9f9..00000000
--- a/src/database/kernel_selection.hpp
+++ /dev/null
@@ -1,136 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel
-// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the
-// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use
-// some common default values.
-//
-// =================================================================================================
-
-namespace clblast {
-namespace database {
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionHalf = {
-  "KernelSelection", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, {
-    { // Intel GPUs
-      kDeviceTypeGPU, "Intel", {
-        { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // NVIDIA GPUs
-      kDeviceTypeGPU, "NVIDIA", {
-        { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // Default
-      kDeviceTypeAll, "default", {
-        { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-  }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionSingle = {
-  "KernelSelection", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
-    { // Intel GPUs
-      kDeviceTypeGPU, "Intel", {
-        { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // NVIDIA GPUs
-      kDeviceTypeGPU, "NVIDIA", {
-        { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { 
-      kDeviceTypeGPU, "ARM", {
-        { "default", { { kDeviceNameDefault, Params{ 128*128*128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    }, 
-    { // Default
-      kDeviceTypeAll, "default", {
-        { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-  }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionComplexSingle = {
-  "KernelSelection", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
-    { // Intel GPUs
-      kDeviceTypeGPU, "Intel", {
-        { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // NVIDIA GPUs
-      kDeviceTypeGPU, "NVIDIA", {
-        { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // Default
-      kDeviceTypeAll, "default", {
-        { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-  }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionDouble = {
-  "KernelSelection", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
-    { // Intel GPUs
-      kDeviceTypeGPU, "Intel", {
-        { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // NVIDIA GPUs
-      kDeviceTypeGPU, "NVIDIA", {
-        { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // Default
-      kDeviceTypeAll, "default", {
-        { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-  }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionComplexDouble = {
-  "KernelSelection", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
-    { // Intel GPUs
-      kDeviceTypeGPU, "Intel", {
-        { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // NVIDIA GPUs
-      kDeviceTypeGPU, "NVIDIA", {
-        { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // Default
-      kDeviceTypeAll, "default", {
-        { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-  }
-};
-
-// =================================================================================================
-} // namespace database
-} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine.hpp b/src/database/kernels/gemm_routine/gemm_routine.hpp
new file mode 100644
index 00000000..f1470252
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine.hpp
@@ -0,0 +1,14 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine' kernels.
+//
+// =================================================================================================
+
+#include "database/kernels/gemm_routine/gemm_routine_16.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_32.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_3232.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_64.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_6464.hpp"
diff --git a/src/database/kernels/gemm_routine/gemm_routine_16.hpp b/src/database/kernels/gemm_routine/gemm_routine_16.hpp
new file mode 100644
index 00000000..e17afe4b
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_16.hpp
@@ -0,0 +1,26 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine16' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineHalf = { // half-precision entry of the "GemmRoutine" database
+  "GemmRoutine", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+    { // Default: all device types (kDeviceTypeAll), vendor "default"
+      kDeviceTypeAll, "default", {
+        { "default", {
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+  }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
new file mode 100644
index 00000000..624de564
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
@@ -0,0 +1,34 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine32' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineSingle = { // single-precision entry of the "GemmRoutine" database
+  "GemmRoutine", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+    { // Intel GPUs: one named device plus a vendor-wide default
+      kDeviceTypeGPU, "Intel", {
+        { "default", {
+          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+    { // Default: all device types (kDeviceTypeAll), vendor "default"
+      kDeviceTypeAll, "default", {
+        { "default", {
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+  }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
new file mode 100644
index 00000000..689ae8d8
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
@@ -0,0 +1,34 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine3232' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineComplexSingle = { // complex single-precision "GemmRoutine" entry
+  "GemmRoutine", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+    { // Intel GPUs: one named device plus a vendor-wide default
+      kDeviceTypeGPU, "Intel", {
+        { "default", {
+          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+    { // Default: all device types (kDeviceTypeAll), vendor "default"
+      kDeviceTypeAll, "default", {
+        { "default", {
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+  }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_64.hpp b/src/database/kernels/gemm_routine/gemm_routine_64.hpp
new file mode 100644
index 00000000..7fd29128
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_64.hpp
@@ -0,0 +1,26 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine64' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineDouble = { // double-precision entry of the "GemmRoutine" database
+  "GemmRoutine", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+    { // Default: all device types (kDeviceTypeAll), vendor "default"
+      kDeviceTypeAll, "default", {
+        { "default", {
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+  }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
new file mode 100644
index 00000000..85d2c8f1
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
@@ -0,0 +1,26 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine6464' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineComplexDouble = { // "GemmRoutine" database entry for Precision::kComplexDouble
+  "GemmRoutine", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { // name, precision, parameter names, then the per-device records
+    { // Default
+      kDeviceTypeAll, "default", { // presumably device-type and vendor selectors -- TODO confirm against DatabaseEntry
+        { "default", { // presumably the architecture selector -- TODO confirm
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, // presumably slot 0 = XGEMM_MIN_INDIRECT_SIZE; Xgemm::DoGemm cubes it and runs the direct kernel when m*n*k < value^3
+        } },
+      }
+    },
+  }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/routine.cpp b/src/routine.cpp
index 0f9fe360..48273eac 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -43,7 +43,7 @@ const std::unordered_map<std::string, const std::vector<std::string>> Routine::r
   {"Padtranspose", routines_gemm_syrk},
   {"Xgemm", routines_gemm_syrk},
   {"XgemmDirect", routines_gemm},
-  {"KernelSelection", routines_gemm},
+  {"GemmRoutine", routines_gemm},
   {"Invert", routines_trsm},
 };
 // =================================================================================================
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index a0063ee2..94392dd0 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -23,7 +23,7 @@ namespace clblast {
 template <typename T>
 Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
     Routine(queue, event, name,
-            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"},
             PrecisionValue<T>(), {}, {
     #include "../../kernels/level3/level3.opencl"
     #include "../../kernels/level3/copy_fast.opencl"
@@ -104,7 +104,9 @@ void Xgemm<T>::DoGemm(const Layout layout,
   // Selects which version of GEMM to run
   const auto m_n_k = static_cast<unsigned long long>(m) * static_cast<unsigned long long>(n) *
                      static_cast<unsigned long long>(k);
-  const auto do_gemm_direct = (m_n_k < static_cast<unsigned long long>(db_["XGEMM_MIN_INDIRECT_SIZE"]));
+  const auto database_value = static_cast<unsigned long long>(db_["XGEMM_MIN_INDIRECT_SIZE"]);
+  const auto min_indirect_size = database_value * database_value * database_value;
+  const auto do_gemm_direct = (m_n_k < min_indirect_size);
   if (do_gemm_direct) { // for small sizes (single kernel)
     GemmDirect(m, n, k, alpha,
                a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp
index 8a015e97..152e7194 100644
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@@ -23,7 +23,7 @@ namespace clblast {
 template <typename T>
 XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::string &name):
     Routine(queue, event, name,
-            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"},
             PrecisionValue<T>(), {}, {
     #include "../../kernels/level3/level3.opencl"
     #include "../../kernels/level3/copy_fast.opencl"
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index 1ccaa0ca..f45e8635 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -42,7 +42,7 @@ void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Bu
 
 template <typename T>
 void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device) {
-  const auto override_status = OverrideParameters(device(), "KernelSelection", PrecisionValue<T>(),
+  const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(),
                                                   {{"XGEMM_MIN_INDIRECT_SIZE", minimum_size}});
   if (override_status != StatusCode::kSuccess) {
     throw RuntimeError("OverrideParameters failed with status " + ToString(override_status));
@@ -61,7 +61,7 @@ void TuneXgemm(int argc, char* argv[]) {
 
   // Values for m, n, and k
   const auto from = size_t{64};
-  const auto to = size_t{1024};
+  const auto to = size_t{2048};
   const auto step = size_t{64};
 
   // OpenCL initialisation
@@ -106,7 +106,10 @@ void TuneXgemm(int argc, char* argv[]) {
     scores[i] = TuningResult{
         "gemm_kernel_selection",
         static_cast<double>(score) / static_cast<double>(scores.size() - 1) + epsilon,
-        TuningParameters{TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first}}
+        TuningParameters{
+            TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first},
+            TuningParameter{"PRECISION", static_cast<size_t>(precision)}
+        }
     };
   }
 
@@ -126,11 +129,15 @@ void TuneXgemm(int argc, char* argv[]) {
   const auto precision_string = std::to_string(static_cast<size_t>(precision));
   auto metadata = std::vector<std::pair<std::string,std::string>>{
       {"kernel_family", "gemm_routine"},
+      {"arg_from", ToString(from)},
+      {"arg_to", ToString(to)},
+      {"arg_step", ToString(step)},
       {"precision", precision_string},
   };
   PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json",
                            device, platform, metadata, scores);
 
+  printf("[  STATUS  ] All done\n");
 }
 
 // =================================================================================================
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index 423e6e2b..bfad6147 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -73,16 +73,17 @@ void PrintTimingsToFileAsJSON(const std::string &filename,
                               const Device& device, const Platform& platform,
                               const std::vector<std::pair<std::string,std::string>> &metadata,
                               const std::vector<TuningResult>& tuning_results) {
+  printf("[  STATUS  ] Writing results to '%s'\n", filename.c_str());
   auto file = fopen(filename.c_str(), "w");
   fprintf(file, "{\n");
   for (auto &datum: metadata) {
     fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
   }
   fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
-  fprintf(file, "  \"device_name\": \"%s\",\n", GetDeviceName(device).c_str());
-  fprintf(file, "  \"device_vendor\": \"%s\",\n", platform.Vendor().c_str());
-  fprintf(file, "  \"device_type\": \"%s\",\n", device.Type().c_str());
-  fprintf(file, "  \"device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
+  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
+  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
   fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
   fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
   fprintf(file, "  \"results\": [\n");
diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp
index 8444c1c3..fe8cf7b9 100644
--- a/test/routines/level3/xgemm.hpp
+++ b/test/routines/level3/xgemm.hpp
@@ -86,7 +86,7 @@ class TestXgemm {
     if (V != 0) {
       const auto device = queue.GetDevice();
       const auto switch_threshold = (V == 1) ? size_t{0} : size_t{1024 * 1024 * 1024}; // large enough for tests
-      const auto override_status = OverrideParameters(device(), "KernelSelection", PrecisionValue<T>(),
+      const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(),
                                                       {{"XGEMM_MIN_INDIRECT_SIZE", switch_threshold}});
       if (override_status != StatusCode::kSuccess) { return override_status; }
     }
-- 
cgit v1.2.3