Integrated the GEMM routine tuner for kernel selection; added first tuning results

author: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-02 21:47:14 +0100
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-02 21:47:14 +0100
commit: 9b0a435fb00b845b875590be90acffcd4f3bb009 (patch)
tree: 754b523789ef717619b540925c97e7167ba28f06
parent: 73272ab97dbd5abe757f6558c9b89665c5ac99d0 (diff)
18 files changed, 193 insertions, 154 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 14a6dd22..c565559f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -8,6 +8,7 @@ Development (next version)
   * All correctness tests and performance clients work on CUDA like they did for OpenCL
 - Kernels are now cached based on their tuning parameters: fits the use-case of 'OverrideParameters'
 - Improved performance for small GEMM problems by going from 3 to 1 optional temporary buffers
+- GEMM kernel selection (direct vs in-direct) is now done automatically using a new tuner
 - Various minor fixes and enhancements
 - Added tuned parameters for various devices (see README)
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73b47637..a982d87d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -192,7 +192,9 @@ endif()
 set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
             xgemm xgemm_direct xgemv)
 set(DATABASES copy pad padtranspose transpose xaxpy xdot
-              xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger)
+              xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger
+              gemm_routine)
+set(ROUTINE_TUNERS xgemm)
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
                     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
@@ -231,7 +233,6 @@ set(HEADERS  # such that they can be discovered by IDEs such as CLion and Visual
   src/database/apple_cpu_fallback.hpp
   src/database/database.hpp
   src/database/database_structure.hpp
-  src/database/kernel_selection.hpp
   src/routines/level1/xamin.hpp
   src/routines/level1/xmax.hpp
   src/routines/level1/xmin.hpp
@@ -377,7 +378,6 @@ if(TUNERS)
     target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
-  set(ROUTINE_TUNERS xgemm)
   foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
     add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
     target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
diff --git a/README.md b/README.md
index 0232c3f3..3070cc9c 100644
--- a/README.md
+++ b/README.md
@@ -196,6 +196,8 @@ In summary, tuning the entire library for your device can be done as follows (st
 
 Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). To do so, you can call the `OverrideParameters` function which will set new parameters for a specific kernel. At the first next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on. Until `OverrideParameters` is called again of course. See the [API documentation](doc/clblast.md#overrideparameters-override-tuning-parameters-auxiliary-function) for more details.
 
+After the kernels are tuned, you can run the `clblast_tuner_routine_xgemm` tuner to optimize the high-level GEMM routine, i.e. to select which method to use: the direct kernel or the in-direct kernel.
+
 
 Compiling the correctness tests (optional)
 -------------
diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py
index 428bfdda..2b4f734c 100644
--- a/scripts/database/database/clblast.py
+++ b/scripts/database/database/clblast.py
@@ -23,7 +23,8 @@ DEVICE_TYPE_ATTRIBUTES = ["clblast_device_vendor", "clblast_device_type"]
 DEVICE_ATTRIBUTES = ["clblast_device_name", "clblast_device_architecture",
                      "device_core_clock", "device_compute_units"]
 KERNEL_ATTRIBUTES = ["precision", "kernel_family"]
-ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
+ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta",
+                       "arg_from", "arg_to", "arg_step"]
 ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES
 GROUP_ATTRIBUTES = DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ["kernel"] + ARGUMENT_ATTRIBUTES
 
diff --git a/src/database/database.cpp b/src/database/database.cpp
index 836c8803..2fa86151 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -30,10 +30,11 @@
 #include "database/kernels/transpose/transpose.hpp"
 #include "database/kernels/padtranspose/padtranspose.hpp"
 
+#include "database/kernels/gemm_routine/gemm_routine.hpp"
+
 #include "database/kernels/xtrsv.hpp"
 #include "database/kernels/invert.hpp"
 #include "database/apple_cpu_fallback.hpp"
-#include "database/kernel_selection.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -54,7 +55,7 @@ const std::vector<database::DatabaseEntry> Database::database = std::vector<data
   database::TransposeHalf, database::TransposeSingle, database::TransposeDouble, database::TransposeComplexSingle, database::TransposeComplexDouble,
   database::PadtransposeHalf, database::PadtransposeSingle, database::PadtransposeDouble, database::PadtransposeComplexSingle, database::PadtransposeComplexDouble,
   database::InvertHalf, database::InvertSingle, database::InvertDouble, database::InvertComplexSingle, database::InvertComplexDouble,
-  database::KernelSelectionHalf, database::KernelSelectionSingle, database::KernelSelectionDouble, database::KernelSelectionComplexSingle, database::KernelSelectionComplexDouble
+  database::GemmRoutineHalf, database::GemmRoutineSingle, database::GemmRoutineDouble, database::GemmRoutineComplexSingle, database::GemmRoutineComplexDouble
 };
 const std::vector<database::DatabaseEntry> Database::apple_cpu_fallback = std::vector<database::DatabaseEntry>{
   database::XaxpyApple, database::XdotApple,
diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp
deleted file mode 100644
index 6d74b9f9..00000000
--- a/src/database/kernel_selection.hpp
+++ /dev/null
@@ -1,136 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel
-// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the
-// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use
-// some common default values.
-//
-// =================================================================================================
-
-namespace clblast {
-namespace database {
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionHalf = {
-  "KernelSelection", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, {
-    { // Intel GPUs
-      kDeviceTypeGPU, "Intel", {
-        { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // NVIDIA GPUs
-      kDeviceTypeGPU, "NVIDIA", {
-        { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // Default
-      kDeviceTypeAll, "default", {
-        { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-  }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionSingle = {
-  "KernelSelection", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
-    { // Intel GPUs
-      kDeviceTypeGPU, "Intel", {
-        { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // NVIDIA GPUs
-      kDeviceTypeGPU, "NVIDIA", {
-        { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { 
-      kDeviceTypeGPU, "ARM", {
-        { "default", { { kDeviceNameDefault, Params{ 128*128*128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    }, 
-    { // Default
-      kDeviceTypeAll, "default", {
-        { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-  }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionComplexSingle = {
-  "KernelSelection", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
-    { // Intel GPUs
-      kDeviceTypeGPU, "Intel", {
-        { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // NVIDIA GPUs
-      kDeviceTypeGPU, "NVIDIA", {
-        { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // Default
-      kDeviceTypeAll, "default", {
-        { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-  }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionDouble = {
-  "KernelSelection", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
-    { // Intel GPUs
-      kDeviceTypeGPU, "Intel", {
-        { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // NVIDIA GPUs
-      kDeviceTypeGPU, "NVIDIA", {
-        { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // Default
-      kDeviceTypeAll, "default", {
-        { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-  }
-};
-
-// =================================================================================================
-
-const DatabaseEntry KernelSelectionComplexDouble = {
-  "KernelSelection", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
-    { // Intel GPUs
-      kDeviceTypeGPU, "Intel", {
-        { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // NVIDIA GPUs
-      kDeviceTypeGPU, "NVIDIA", {
-        { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-    { // Default
-      kDeviceTypeAll, "default", {
-        { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } },
-      }
-    },
-  }
-};
-
-// =================================================================================================
-} // namespace database
-} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine.hpp b/src/database/kernels/gemm_routine/gemm_routine.hpp
new file mode 100644
index 00000000..f1470252
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine.hpp
@@ -0,0 +1,14 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine' kernels.
+//
+// =================================================================================================
+
+#include "database/kernels/gemm_routine/gemm_routine_16.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_32.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_3232.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_64.hpp"
+#include "database/kernels/gemm_routine/gemm_routine_6464.hpp"
diff --git a/src/database/kernels/gemm_routine/gemm_routine_16.hpp b/src/database/kernels/gemm_routine/gemm_routine_16.hpp
new file mode 100644
index 00000000..e17afe4b
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_16.hpp
@@ -0,0 +1,26 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine16' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineHalf = {
+  "GemmRoutine", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default", {
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+  }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
new file mode 100644
index 00000000..624de564
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
@@ -0,0 +1,34 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine32' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineSingle = {
+  "GemmRoutine", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default", {
+          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default", {
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+  }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
new file mode 100644
index 00000000..689ae8d8
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
@@ -0,0 +1,34 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine3232' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineComplexSingle = {
+  "GemmRoutine", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default", {
+          { Name{"Intel(R) HD Graphics Skylake ULT GT2              "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default", {
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+  }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_64.hpp b/src/database/kernels/gemm_routine/gemm_routine_64.hpp
new file mode 100644
index 00000000..7fd29128
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_64.hpp
@@ -0,0 +1,26 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine64' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineDouble = {
+  "GemmRoutine", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default", {
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+  }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
new file mode 100644
index 00000000..85d2c8f1
--- /dev/null
+++ b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
@@ -0,0 +1,26 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
+// is auto-generated by the 'scripts/database/database.py' Python script.
+//
+// This file populates the database with best-found tuning parameters for the 'Gemm_Routine6464' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+
+const DatabaseEntry GemmRoutineComplexDouble = {
+  "GemmRoutine", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default", {
+          { kDeviceNameDefault                                        , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
+      }
+    },
+  }
+};
+
+} // namespace database
+} // namespace clblast
diff --git a/src/routine.cpp b/src/routine.cpp
index 0f9fe360..48273eac 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -43,7 +43,7 @@ const std::unordered_map<std::string, const std::vector<std::string>> Routine::r
   {"Padtranspose", routines_gemm_syrk},
   {"Xgemm", routines_gemm_syrk},
   {"XgemmDirect", routines_gemm},
-  {"KernelSelection", routines_gemm},
+  {"GemmRoutine", routines_gemm},
   {"Invert", routines_trsm},
 };
 // =================================================================================================
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index a0063ee2..94392dd0 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -23,7 +23,7 @@ namespace clblast {
 template <typename T>
 Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
     Routine(queue, event, name,
-            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"},
             PrecisionValue<T>(), {}, {
     #include "../../kernels/level3/level3.opencl"
     #include "../../kernels/level3/copy_fast.opencl"
@@ -104,7 +104,9 @@ void Xgemm<T>::DoGemm(const Layout layout,
   // Selects which version of GEMM to run
   const auto m_n_k = static_cast<unsigned long long>(m) * static_cast<unsigned long long>(n) *
                      static_cast<unsigned long long>(k);
-  const auto do_gemm_direct = (m_n_k < static_cast<unsigned long long>(db_["XGEMM_MIN_INDIRECT_SIZE"]));
+  const auto database_value = static_cast<unsigned long long>(db_["XGEMM_MIN_INDIRECT_SIZE"]);
+  const auto min_indirect_size = database_value * database_value * database_value;
+  const auto do_gemm_direct = (m_n_k < min_indirect_size);
   if (do_gemm_direct) { // for small sizes (single kernel)
     GemmDirect(m, n, k, alpha,
                a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp
index 8a015e97..152e7194 100644
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@@ -23,7 +23,7 @@ namespace clblast {
 template <typename T>
 XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::string &name):
     Routine(queue, event, name,
-            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"},
             PrecisionValue<T>(), {}, {
     #include "../../kernels/level3/level3.opencl"
     #include "../../kernels/level3/copy_fast.opencl"
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index 1ccaa0ca..f45e8635 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -42,7 +42,7 @@ void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Bu
 
 template <typename T>
 void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device) {
-  const auto override_status = OverrideParameters(device(), "KernelSelection", PrecisionValue<T>(),
+  const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(),
                                                   {{"XGEMM_MIN_INDIRECT_SIZE", minimum_size}});
   if (override_status != StatusCode::kSuccess) {
     throw RuntimeError("OverrideParameters failed with status " + ToString(override_status));
@@ -61,7 +61,7 @@ void TuneXgemm(int argc, char* argv[]) {
 
   // Values for m, n, and k
   const auto from = size_t{64};
-  const auto to = size_t{1024};
+  const auto to = size_t{2048};
   const auto step = size_t{64};
 
   // OpenCL initialisation
@@ -106,7 +106,10 @@ void TuneXgemm(int argc, char* argv[]) {
     scores[i] = TuningResult{
         "gemm_kernel_selection",
         static_cast<double>(score) / static_cast<double>(scores.size() - 1) + epsilon,
-        TuningParameters{TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first}}
+        TuningParameters{
+            TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first},
+            TuningParameter{"PRECISION", static_cast<size_t>(precision)}
+        }
     };
   }
 
@@ -126,11 +129,15 @@ void TuneXgemm(int argc, char* argv[]) {
   const auto precision_string = std::to_string(static_cast<size_t>(precision));
   auto metadata = std::vector<std::pair<std::string,std::string>>{
       {"kernel_family", "gemm_routine"},
+      {"arg_from", ToString(from)},
+      {"arg_to", ToString(to)},
+      {"arg_step", ToString(step)},
       {"precision", precision_string},
   };
   PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json",
                            device, platform, metadata, scores);
 
+  printf("[  STATUS  ] All done\n");
 }
 
 // =================================================================================================
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index 423e6e2b..bfad6147 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -73,16 +73,17 @@ void PrintTimingsToFileAsJSON(const std::string &filename,
                               const Device& device, const Platform& platform,
                               const std::vector<std::pair<std::string,std::string>> &metadata,
                               const std::vector<TuningResult>& tuning_results) {
+  printf("[  STATUS  ] Writing results to '%s'\n", filename.c_str());
   auto file = fopen(filename.c_str(), "w");
   fprintf(file, "{\n");
   for (auto &datum: metadata) {
     fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
   }
   fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
-  fprintf(file, "  \"device_name\": \"%s\",\n", GetDeviceName(device).c_str());
-  fprintf(file, "  \"device_vendor\": \"%s\",\n", platform.Vendor().c_str());
-  fprintf(file, "  \"device_type\": \"%s\",\n", device.Type().c_str());
-  fprintf(file, "  \"device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
+  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
+  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
   fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
   fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
   fprintf(file, "  \"results\": [\n");
diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp
index 8444c1c3..fe8cf7b9 100644
--- a/test/routines/level3/xgemm.hpp
+++ b/test/routines/level3/xgemm.hpp
@@ -86,7 +86,7 @@ class TestXgemm {
     if (V != 0) {
       const auto device = queue.GetDevice();
       const auto switch_threshold = (V == 1) ? size_t{0} : size_t{1024 * 1024 * 1024}; // large enough for tests
-      const auto override_status = OverrideParameters(device(), "KernelSelection", PrecisionValue<T>(),
+      const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(),
                                                       {{"XGEMM_MIN_INDIRECT_SIZE", switch_threshold}});
       if (override_status != StatusCode::kSuccess) { return override_status; }
     }
author	Cedric Nugteren <web@cedricnugteren.nl>	2017-11-02 21:47:14 +0100
committer	Cedric Nugteren <web@cedricnugteren.nl>	2017-11-02 21:47:14 +0100
commit	9b0a435fb00b845b875590be90acffcd4f3bb009 (patch)
tree	754b523789ef717619b540925c97e7167ba28f06
parent	73272ab97dbd5abe757f6558c9b89665c5ac99d0 (diff)