From 677afd3b96b2cbd3d2aae77e90cab87d2cc1eaa2 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 11 Nov 2017 16:14:43 +0100
Subject: Factored out the creation of the OpenCL header and the program
 compilation

---
 src/routine.cpp         | 68 +++---------------------------------------
 src/routines/common.cpp | 78 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/routines/common.hpp |  6 ++++
 3 files changed, 88 insertions(+), 64 deletions(-)
diff --git a/src/routine.cpp b/src/routine.cpp
index 81201eea..93882fbf 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -135,74 +135,21 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
     throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
   }
 
-  // Collects the parameters for this device in the form of defines, and adds the precision
+  // Collects the parameters for this device in the form of defines
   auto source_string = std::string{""};
   for (const auto &kernel_name : kernel_names_) {
     source_string += db_(kernel_name).GetDefines();
   }
-  source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
-
-  // Adds the name of the routine as a define
-  source_string += "#define ROUTINE_"+routine_name_+"\n";
-
-  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
-  // which it is known to work with all OpenCL platforms.
-  if (device_.IsNVIDIA() || device_.IsARM()) {
-    source_string += "#define USE_INLINE_KEYWORD 1\n";
-  }
-
-  // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
-  // performance, but might result in a reduced accuracy.
-  if (device_.IsAMD() && device_.IsGPU()) {
-    source_string += "#define USE_CL_MAD 1\n";
-  }
-
-  // For specific devices, use staggered/shuffled workgroup indices.
-  if (device_.IsAMD() && device_.IsGPU()) {
-    source_string += "#define USE_STAGGERED_INDICES 1\n";
-  }
-
-  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
-  // performance through better cache behaviour
-  if (device_.IsARM() && device_.IsGPU()) {
-    source_string += "#define GLOBAL_MEM_FENCE 1\n";
-  }
-
-  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
-  #ifdef CUDA_API
-    source_string +=
-      #include "kernels/opencl_to_cuda.h"
-    ;
-  #endif
-
-  // Loads the common header (typedefs and defines and such)
-  source_string +=
-    #include "kernels/common.opencl"
-  ;
 
   // Adds routine-specific code to the constructed source string
   for (const char *s: source) {
     source_string += s;
   }
 
-  // Prints details of the routine to compile in case of debugging in verbose mode
-  #ifdef VERBOSE
-    printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n",
-           routine_name_.c_str(), ToString(precision_).c_str(), device_name.c_str());
-    const auto start_time = std::chrono::steady_clock::now();
-  #endif
+  // Completes the source and compiles the kernel
+  program_ = CompileFromSource(source_string, precision_, routine_name_,
+                               device_, context_, options);
 
-  // Compiles the kernel
-  program_ = Program(context_, source_string);
-  try {
-    program_.Build(device_, options);
-  } catch (const CLCudaAPIBuildError &e) {
-    if (program_.StatusIsCompilationWarningOrError(e.status())) {
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
-              program_.GetBuildInfo(device_).c_str());
-    }
-    throw;
-  }
 
   // Store the compiled binary and program in the cache
   BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
@@ -210,13 +157,6 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
 
   ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
                                  Program{ program_ });
-
-  // Prints the elapsed compilation time in case of debugging in verbose mode
-  #ifdef VERBOSE
-    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
-    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
-    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
-  #endif
 }
 
 // =================================================================================================
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index 5b178e53..c415d9fd 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -19,6 +19,84 @@
 namespace clblast {
 // =================================================================================================
 
+// Compiles a program from source code. The given source is prefixed with a header holding the
+// precision and routine-name defines, the device-specific defines, the OpenCL-to-CUDA translation
+// layer (CUDA_API builds only), and the common kernel header. Returns the built program. If the
+// build throws a CLCudaAPIBuildError, the build info is printed to stdout if the program reports
+// the status as a compilation warning or error; the exception is re-thrown in all cases.
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options) {
+  auto header_string = std::string{""};
+
+  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
+
+  // Adds the name of the routine as a define
+  header_string += "#define ROUTINE_" + routine_name + "\n";
+
+  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+  // which it is known to work with all OpenCL platforms.
+  if (device.IsNVIDIA() || device.IsARM()) {
+    header_string += "#define USE_INLINE_KEYWORD 1\n";
+  }
+
+  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction (might improve
+  // performance at the cost of accuracy) and use staggered/shuffled workgroup indices.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_CL_MAD 1\n";
+    header_string += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device.IsARM() && device.IsGPU()) {
+    header_string += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Optionally adds a translation header from OpenCL kernels to CUDA kernels (source is const)
+  #ifdef CUDA_API
+    header_string +=
+      #include "kernels/opencl_to_cuda.h"
+    ;
+  #endif
+
+  // Loads the common header (typedefs and defines and such)
+  header_string +=
+    #include "kernels/common.opencl"
+  ;
+
+  // Prints details of the routine to compile in case of debugging in verbose mode
+  #ifdef VERBOSE
+    printf("[DEBUG] Compiling routine '%s-%s'\n",
+           routine_name.c_str(), ToString(precision).c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
+  // Compiles the kernel
+  auto program = Program(context, header_string + source_string);
+  try {
+    program.Build(device, options);
+  } catch (const CLCudaAPIBuildError &e) {
+    if (program.StatusIsCompilationWarningOrError(e.status())) {
+      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
+              program.GetBuildInfo(device).c_str());
+    }
+    throw;
+  }
+
+  // Prints the elapsed compilation time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+  #endif
+
+  return program;
+}
+
+// =================================================================================================
+
 // Enqueues a kernel, waits for completion, and checks for errors
 void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                std::vector<size_t> global, const std::vector<size_t> &local,
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index bf3b1762..8a93d74a 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -25,6 +25,12 @@
 namespace clblast {
 // =================================================================================================
 
+// Compiles a program from source code after prepending the common defines and kernel header
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options);
+
 // Enqueues a kernel, waits for completion, and checks for errors
 void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                std::vector<size_t> global, const std::vector<size_t> &local,
-- 
cgit v1.2.3


From 4bac1287f2d49bece72822bf6032e4da56a2dd2d Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Mon, 13 Nov 2017 21:10:44 +0100
Subject: Moved square-difference utility function for use in the tuners

---
 src/utilities/utilities.cpp | 31 +++++++++++++++++++++++++++++++
 src/utilities/utilities.hpp |  6 ++++++
 test/correctness/tester.cpp | 31 -------------------------------
 test/correctness/tester.hpp |  4 ----
 4 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index f2574104..1546fbf5 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -397,6 +397,37 @@ template <> bool PrecisionSupported<half>(const Device &device) { return device.
 
 // =================================================================================================
 
+// Retrieves the squared difference, used for example for computing the L2 error
+template <typename T>
+double SquaredDifference(const T val1, const T val2) {
+  const auto difference = (val1 - val2);
+  return static_cast<double>(difference * difference);
+}
+
+// Compiles the default case for standard data-types
+template double SquaredDifference<float>(const float, const float);
+template double SquaredDifference<double>(const double, const double);
+
+// Specialisations for non-standard data-types
+template <>
+double SquaredDifference(const float2 val1, const float2 val2) {
+  const auto real = SquaredDifference(val1.real(), val2.real());
+  const auto imag = SquaredDifference(val1.imag(), val2.imag());
+  return real + imag;
+}
+template <>
+double SquaredDifference(const double2 val1, const double2 val2) {
+  const auto real = SquaredDifference(val1.real(), val2.real());
+  const auto imag = SquaredDifference(val1.imag(), val2.imag());
+  return real + imag;
+}
+template <>
+double SquaredDifference(const half val1, const half val2) {
+  return SquaredDifference(HalfToFloat(val1), HalfToFloat(val2));
+}
+
+// =================================================================================================
+
 // High-level info
 std::string GetDeviceType(const Device& device) {
   return device.Type();
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index f56226be..3f90906d 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -323,6 +323,12 @@ bool PrecisionSupported(const Device &device);
 
 // =================================================================================================
 
+// Retrieves the squared difference, used for example for computing the L2 error
+template <typename T>
+double SquaredDifference(const T val1, const T val2);
+
+// =================================================================================================
+
 // Device information in a specific CLBlast form
 std::string GetDeviceType(const Device& device);
 std::string GetDeviceVendor(const Device& device);
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index a10736ea..d6a346a6 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -539,37 +539,6 @@ bool TestSimilarity(const half val1, const half val2) {
 
 // =================================================================================================
 
-// Retrieves the squared difference, used for example for computing the L2 error
-template <typename T>
-double SquaredDifference(const T val1, const T val2) {
-  const auto difference = (val1 - val2);
-  return static_cast<double>(difference * difference);
-}
-
-// Compiles the default case for standard data-types
-template double SquaredDifference<float>(const float, const float);
-template double SquaredDifference<double>(const double, const double);
-
-// Specialisations for non-standard data-types
-template <>
-double SquaredDifference(const float2 val1, const float2 val2) {
-  const auto real = SquaredDifference(val1.real(), val2.real());
-  const auto imag = SquaredDifference(val1.imag(), val2.imag());
-  return real + imag;
-}
-template <>
-double SquaredDifference(const double2 val1, const double2 val2) {
-  const auto real = SquaredDifference(val1.real(), val2.real());
-  const auto imag = SquaredDifference(val1.imag(), val2.imag());
-  return real + imag;
-}
-template <>
-double SquaredDifference(const half val1, const half val2) {
-  return SquaredDifference(HalfToFloat(val1), HalfToFloat(val2));
-}
-
-// =================================================================================================
-
 // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
 // routines. This function is specialised for the different data-types.
 template <> const std::vector<float> GetExampleScalars(const bool full_test) {
diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp
index 640f870a..7e17e53d 100644
--- a/test/correctness/tester.hpp
+++ b/test/correctness/tester.hpp
@@ -201,10 +201,6 @@ template <typename T> double getL2ErrorMargin();
 template <typename T>
 bool TestSimilarity(const T val1, const T val2);
 
-// Retrieves the squared difference, used for example for computing the L2 error
-template <typename T>
-double SquaredDifference(const T val1, const T val2);
-
 // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
 // routines. This function is specialised for the different data-types.
 template <typename T>
-- 
cgit v1.2.3


From 03ebf14b97707f425519f46c3bb514f8ef5c93ad Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Mon, 13 Nov 2017 21:11:31 +0100
Subject: Made the exception dispatch function optionally silent

---
 src/utilities/clblast_exceptions.cpp | 4 ++--
 src/utilities/clblast_exceptions.hpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/utilities/clblast_exceptions.cpp b/src/utilities/clblast_exceptions.cpp
index 32526215..8038805d 100644
--- a/src/utilities/clblast_exceptions.cpp
+++ b/src/utilities/clblast_exceptions.cpp
@@ -45,7 +45,7 @@ RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreas
 
 // =================================================================================================
 
-StatusCode DispatchException()
+StatusCode DispatchException(const bool silent)
 {
   const char *message = nullptr;
   StatusCode status;
@@ -66,7 +66,7 @@ StatusCode DispatchException()
     status = StatusCode::kUnknownError;
   }
 
-  if (message) {
+  if (message && !silent) {
     fprintf(stderr, "CLBlast: %s\n", message);
   }
   return status;
diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp
index a790be9c..5f2edbae 100644
--- a/src/utilities/clblast_exceptions.hpp
+++ b/src/utilities/clblast_exceptions.hpp
@@ -37,7 +37,7 @@ class RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> {
 // =================================================================================================
 
 // Handles (most of the) runtime exceptions and converts them to StatusCode
-StatusCode DispatchException();
+StatusCode DispatchException(const bool silent = false);
 
 // Handles remaining exceptions and converts them to StatusCode::kUnhandledError
 StatusCode DispatchExceptionForC();
-- 
cgit v1.2.3


From b337bffbaf1df049ffc606e3683bfcdfc951c394 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 15 Nov 2017 22:44:44 +0100
Subject: Added exception handle with catch-all

---
 src/utilities/clblast_exceptions.cpp | 29 +++++++++++++++++++++++++++++
 src/utilities/clblast_exceptions.hpp |  1 +
 2 files changed, 30 insertions(+)

diff --git a/src/utilities/clblast_exceptions.cpp b/src/utilities/clblast_exceptions.cpp
index 8038805d..25e5f4be 100644
--- a/src/utilities/clblast_exceptions.cpp
+++ b/src/utilities/clblast_exceptions.cpp
@@ -72,6 +72,35 @@ StatusCode DispatchException(const bool silent)
   return status;
 }
 
+StatusCode DispatchExceptionCatchAll(const bool silent)
+{ // Must be called from within a catch handler: converts the active exception to a StatusCode
+  const char *message = nullptr; // stays null when there is nothing to print
+  StatusCode status;
+
+  try {
+    throw; // re-throws the exception currently being handled to dispatch on its type
+  } catch (BLASError &e) {
+    // no message is printed for invalid argument errors
+    status = e.status();
+  } catch (CLCudaAPIError &e) {
+    message = e.what();
+    status = static_cast<StatusCode>(e.status());
+  } catch (RuntimeErrorCode &e) {
+    message = e.what();
+    status = e.status();
+  } catch (Error<std::runtime_error> &e) {
+    message = e.what();
+    status = StatusCode::kUnknownError;
+  } catch (...) { // any other exception type is reported as an unknown error as well
+    message = "unknown exception type";
+    status = StatusCode::kUnknownError;
+  }
+
+  if (message && !silent) { // 'silent' only suppresses the message, the status is still returned
+    fprintf(stderr, "CLBlast: %s\n", message);
+  }
+  return status;
+}
 // =================================================================================================
 
 StatusCode DispatchExceptionForC()
diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp
index 5f2edbae..9bd38187 100644
--- a/src/utilities/clblast_exceptions.hpp
+++ b/src/utilities/clblast_exceptions.hpp
@@ -38,6 +38,7 @@ class RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> {
 
 // Handles (most of the) runtime exceptions and converts them to StatusCode
 StatusCode DispatchException(const bool silent = false);
+StatusCode DispatchExceptionCatchAll(const bool silent = false);
 
 // Handles remaining exceptions and converts them to StatusCode::kUnhandledError
 StatusCode DispatchExceptionForC();
-- 
cgit v1.2.3


From 0cd78bb6f9fbb3f30217e5b0f2267e76a43601a3 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 15 Nov 2017 22:47:06 +0100
Subject: Added kernel timing functionality to the utilities

---
 CMakeLists.txt           |  1 +
 src/utilities/timing.cpp | 79 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/utilities/timing.hpp |  9 ++++++
 3 files changed, 89 insertions(+)
 create mode 100644 src/utilities/timing.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cada61ab..ec757e06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -234,6 +234,7 @@ set(SOURCES
   src/database/database.cpp
   src/routines/common.cpp
   src/utilities/clblast_exceptions.cpp
+  src/utilities/timing.cpp
   src/utilities/utilities.cpp
   src/api_common.cpp
   src/cache.cpp
diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp
new file mode 100644
index 00000000..188e4487
--- /dev/null
+++ b/src/utilities/timing.cpp
@@ -0,0 +1,79 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides helper functions for time measurement and such.
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <exception>
+
+#include "utilities/timing.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Checks the thread sizes and the local memory usage (throws a RuntimeErrorCode when invalid) and
+// then launches the kernel through TimeFunction with 'num_runs', returning TimeFunction's result.
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local) {
+  if (!local.empty()) {
+    // Tests for validity of the local thread sizes
+    if (local.size() > device.MaxWorkItemDimensions()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
+    }
+    const auto max_work_item_sizes = device.MaxWorkItemSizes();
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (local[i] > max_work_item_sizes[i]) {
+        throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
+      }
+    }
+    auto local_size = size_t{1};
+    for (auto &item: local) { local_size *= item; }
+    if (local_size > device.MaxWorkGroupSize()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+    }
+
+    // Make sure the global sizes are at least the local sizes (never indexes past either vector)
+    for (auto i=size_t{0}; i<global.size() && i<local.size(); ++i) {
+      if (global[i] < local[i]) { global[i] = local[i]; }
+    }
+  }
+
+  // Tests for local memory usage
+  if (!device.IsLocalMemoryValid(kernel.LocalMemUsage(device))) {
+    throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
+  }
+
+  // Times the kernel: each call of the function below launches it and waits for event and queue
+  auto event = Event();
+  const auto run_kernel_func = [&]() {
+      kernel.Launch(queue, global, local, event.pointer());
+      event.WaitForCompletion();
+      queue.Finish();
+  };
+  return TimeFunction(num_runs, run_kernel_func);
+}
+
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local) {
+  try { // prints the timing result (or the error code on failure) as one '|'-terminated column
+    const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local);
+    printf(" %7.2lf ms |", time_ms);
+    return time_ms;
+  }
+  catch (...) { // any exception is reported as a status code instead of being propagated
+    const auto status_code = DispatchExceptionCatchAll(true); // true: do not print to stderr
+    printf("  error %3d |", static_cast<int>(status_code));
+    return -1.0; // negative value signals that no valid timing is available
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index bfad6147..fb5b9e78 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -21,6 +21,7 @@
 #include <chrono>
 
 #include "utilities/utilities.hpp"
+#include "routines/common.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -40,6 +41,14 @@ double TimeFunction(const size_t num_runs, F const &function) {
 
 // =================================================================================================
 
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local);
+
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local);
+
+// =================================================================================================
+
 using Timing = std::pair<size_t, double>;
 
 template <typename T, typename F>
-- 
cgit v1.2.3


From 1b2b46f2f073863a1faac9307583c9d5bb276e10 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 15 Nov 2017 22:49:35 +0100
Subject: Added first version of integrated and re-written auto-tuner

---
 CMakeLists.txt                |   6 +-
 src/tuning/configurations.cpp |  99 +++++++++++
 src/tuning/configurations.hpp |  73 ++++++++
 src/tuning/tuning.cpp         |   0
 src/tuning/tuning.hpp         | 403 +++++++++++++++++++++++++++++-------------
 src/utilities/timing.hpp      |  53 ------
 src/utilities/utilities.hpp   |   1 +
 7 files changed, 463 insertions(+), 172 deletions(-)
 create mode 100644 src/tuning/configurations.cpp
 create mode 100644 src/tuning/configurations.hpp
 create mode 100644 src/tuning/tuning.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ec757e06..0d30a38c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -378,8 +378,12 @@ endif()
 # the CLTune library (not included as part of the source).
 if(TUNERS)
 
+  set(TUNERS_COMMON
+      src/tuning/configurations.cpp
+      src/tuning/configurations.hpp
+      src/tuning/tuning.hpp)
+
   # Visual Studio requires the sources of non-exported objects/libraries
-  set(TUNERS_COMMON src/tuning/tuning.hpp)
   if(MSVC)
     set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
   endif()
diff --git a/src/tuning/configurations.cpp b/src/tuning/configurations.cpp
new file mode 100644
index 00000000..459d66b1
--- /dev/null
+++ b/src/tuning/configurations.cpp
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune).
+// This is only used for the optional tuner binaries and not part of the core of CLBlast.
+//
+// =================================================================================================
+
+#include <vector>
+#include <string>
+
+#include "tuning/configurations.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Builds the list of all parameter configurations that satisfy the user-defined constraints
+std::vector<Configuration> SetConfigurations(const std::vector<Parameter> parameters,
+                                             const Constraints& constraints) {
+  std::vector<Configuration> valid_configurations;
+  // Starts the recursion at the first parameter (index 0) with an empty configuration
+  PopulateConfigurations(parameters, 0, Configuration(), valid_configurations, constraints);
+  return valid_configurations;
+}
+
+// Iterates recursively over all permutations of the user-defined parameters
+void PopulateConfigurations(const std::vector<Parameter> &parameters,
+                            const size_t index, const Configuration &config,
+                            std::vector<Configuration> &configuration,
+                            const Constraints& constraints) {
+
+  // End of the chain: all parameters are considered, store the resulting configuration if it is a
+  // valid one according to the constraints ('>=' also guards against an out-of-range start index)
+  if (index >= parameters.size()) {
+    if (ValidConfiguration(config, constraints)) {
+      configuration.push_back(config);
+    }
+    return;
+  }
+
+  // Tries every value of the current parameter (bound by reference: no copy per recursion step)
+  const auto &parameter = parameters[index];
+  for (const auto value: parameter.second) {
+    auto config_copy = config;
+    config_copy[parameter.first] = value;
+    PopulateConfigurations(parameters, index+1, config_copy, configuration, constraints);
+  }
+}
+
+// Checks a configuration against every user-defined constraint; it is valid only if all hold
+bool ValidConfiguration(const Configuration &config,
+                        const Constraints& constraints) {
+
+  // Visits the constraints one by one
+  for (const auto &entry: constraints) {
+
+    // Looks up the configured value of each parameter the constraint refers to, in order
+    auto arguments = std::vector<size_t>();
+    arguments.reserve(entry.parameters.size());
+    for (const auto &parameter_name: entry.parameters) {
+      arguments.push_back(config.at(parameter_name));
+    }
+
+    // A single violated constraint makes the whole configuration invalid
+    const auto satisfied = entry.valid_if(arguments);
+    if (!satisfied) { return false; }
+  }
+
+  // None of the constraints was violated
+  return true;
+}
+
+// Scales a base thread configuration (local/global): applies all multipliers, then all dividers
+std::vector<size_t> SetThreadConfiguration(const Configuration& config,
+                                           const std::vector<size_t> base,
+                                           const TransformVector& mul_config,
+                                           const TransformVector& div_config) {
+  auto threads = base;
+  for (const auto &names: mul_config) {
+    for (auto dim = size_t{0}; dim != names.size(); ++dim) {
+      threads[dim] = threads[dim] * config.at(names[dim]);
+    }
+  }
+  for (const auto &names: div_config) {
+    for (auto dim = size_t{0}; dim != names.size(); ++dim) {
+      threads[dim] = threads[dim] / config.at(names[dim]);
+    }
+  }
+  return threads;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/configurations.hpp b/src/tuning/configurations.hpp
new file mode 100644
index 00000000..74679ff6
--- /dev/null
+++ b/src/tuning/configurations.hpp
@@ -0,0 +1,73 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune).
+// This is only used for the optional tuner binaries and not part of the core of CLBlast.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TUNING_CONFIGURATIONS_H_
+#define CLBLAST_TUNING_CONFIGURATIONS_H_
+
+#include <vector>
+#include <string>
+#include <map>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+using Configuration = std::map<std::string, size_t>; // parameter name -> selected value
+using Parameter = std::pair<std::string, std::vector<size_t>>; // name and its candidate values
+using TransformVector = std::vector<std::vector<std::string>>; // lists of per-dimension names
+
+// Helper structure holding a constraint on parameters: a constraint function object and a vector
+// of parameter names. NOTE(review): relies on <functional> being included indirectly - confirm.
+using ConstraintFunction = std::function<bool(std::vector<size_t>)>;
+struct Constraint {
+  ConstraintFunction valid_if; // returns true if the given parameter values are allowed
+  std::vector<std::string> parameters; // names of the values passed to 'valid_if', in order
+};
+using Constraints = std::vector<Constraint>;
+
+// =================================================================================================
+
+// Initializes an empty configuration (vector of name/value pairs) and kicks-off the recursive
+// function to find all configurations. It also applies the user-defined constraints within.
+std::vector<Configuration> SetConfigurations(const std::vector<Parameter> parameters,
+                                             const Constraints& constraints);
+
+// Iterates recursively over all permutations of the user-defined parameters. This code creates
+// multiple chains, in which each chain selects a unique combination of values for all parameters.
+// At the end of each chain (when all parameters are considered), the function stores the result
+// into the configuration list.
+void PopulateConfigurations(const std::vector<Parameter> &parameters,
+                            const size_t index, const Configuration &config,
+                            std::vector<Configuration> &configuration,
+                            const Constraints& constraints);
+
+// Loops over all user-defined constraints to check whether or not the configuration is valid.
+// Assumes initially all configurations are valid, then returns false if one of the constraints has
+// not been met. Constraints consist of a user-defined function and a list of parameter names, which
+// are replaced by parameter values in this function.
+bool ValidConfiguration(const Configuration &config,
+                        const Constraints& constraints);
+
+// Processes multipliers and dividers to obtain the final thread configuration
+std::vector<size_t> SetThreadConfiguration(const Configuration& config,
+                                           const std::vector<size_t> base,
+                                           const TransformVector& mul_config,
+                                           const TransformVector& div_config);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TUNING_CONFIGURATIONS_H_
+#endif
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
new file mode 100644
index 00000000..e69de29b
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index bc9c0e03..b6edd1f7 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -7,26 +7,43 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the interface to the CLTune auto-tuner. This is only used for the optional
-// and stand-alone tuner binaries and not part of the core of CLBlast.
+// This file implements the generic CLBlast auto-tuner (inspired by CLTune). This is only used for
+// the optional and stand-alone tuner binaries and is not part of the core of CLBlast.
 //
 // =================================================================================================
 
-#ifndef CLBLAST_TUNING_H_
-#define CLBLAST_TUNING_H_
+#ifndef CLBLAST_TUNING_TUNING_H_
+#define CLBLAST_TUNING_TUNING_H_
 
 #include <vector>
 #include <string>
 #include <random>
 #include <utility>
-
-#include <cltune.h>
+#include <algorithm>
+#include <iostream>
 
 #include "utilities/utilities.hpp"
+#include "utilities/timing.hpp"
+#include "tuning/configurations.hpp"
 
 namespace clblast {
 // =================================================================================================
 
+// Constants holding ANSI escape sequences to start/end coloured terminal output (empty on Windows)
+#if defined(_WIN32)
+  const std::string kPrintError = "";
+  const std::string kPrintSuccess = "";
+  const std::string kPrintMessage = "";
+  const std::string kPrintEnd = "";
+#else
+  const std::string kPrintError = "\x1b[31m"; // red foreground
+  const std::string kPrintSuccess = "\x1b[32m"; // green foreground
+  const std::string kPrintMessage = "\x1b[1m"; // bold
+  const std::string kPrintEnd = "\x1b[0m"; // resets all attributes
+#endif
+
+// =================================================================================================
+
 // Structures for the tuners with all the default settings
 struct TunerDefaults {
 
@@ -41,15 +58,7 @@ struct TunerDefaults {
   // Other defaults
   size_t default_batch_count = 1;
   size_t default_num_runs = 10; // run every kernel this many times for averaging
-
-  // Search heuristic defaults
   double default_fraction = 1.0;
-  size_t default_swarm_size_PSO = 8;
-  double default_influence_global_PSO = 0.1;
-  double default_influence_local_PSO = 0.3;
-  double default_influence_random_PSO = 0.6;
-  size_t default_heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
-  double default_max_temp_ann = 1.0;
 };
 
 // Structures for the tuners with the remaining settings
@@ -68,6 +77,10 @@ struct TunerSettings {
   size_t size_c = 1;
   size_t size_temp = 1;
 
+  // Inputs and outputs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+  std::vector<size_t> inputs = {};
+  std::vector<size_t> outputs = {};
+
   // Sets the base thread configuration
   std::vector<size_t> global_size = {};
   std::vector<size_t> global_size_ref = {};
@@ -75,25 +88,79 @@ struct TunerSettings {
   std::vector<size_t> local_size_ref = {};
 
   // Transforms the thread configuration based on the parameters
-  using TransformVector = std::vector<std::vector<std::string>>;
   TransformVector mul_local = {};
   TransformVector div_local = {};
   TransformVector mul_global = {};
   TransformVector div_global = {};
 
   // Sets the tuning parameters and their possible values
-  std::vector<std::pair<std::string, std::vector<size_t>>> parameters;
+  std::vector<Parameter> parameters;
 
   // Describes how to compute the performance metrics
   size_t metric_amount = 0;
   std::string performance_unit = "N/A";
-
-  // Returns which search heuristic to use
-  size_t heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
 };
 
 // =================================================================================================
 
+struct TuningResult { std::string name; double score; Configuration config; }; // score: time in ms
+
+// Writes the meta-data, the platform/device properties, and all tuning results as JSON to the file
+// 'filename'. NOTE(review): the handle returned by fopen is used unchecked (NULL on failure).
+void PrintTimingsToFileAsJSON(const std::string &filename,
+                              const Device& device, const Platform& platform,
+                              const std::vector<std::pair<std::string,std::string>> &metadata,
+                              const std::vector<TuningResult>& tuning_results) {
+  printf("* Writing results to '%s'\n", filename.c_str());
+  auto file = fopen(filename.c_str(), "w");
+  fprintf(file, "{\n");
+  for (auto &datum: metadata) {
+    fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
+  }
+  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
+  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
+  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
+  fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
+  fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
+  fprintf(file, "  \"results\": [\n");
+
+  // Loops over all results
+  auto num_results = tuning_results.size();
+  for (auto r = size_t{0}; r < num_results; ++r) {
+    auto result = tuning_results[r];
+    fprintf(file, "    {\n");
+    fprintf(file, "      \"kernel\": \"%s\",\n", result.name.c_str());
+    fprintf(file, "      \"time\": %.3lf,\n", result.score);
+
+    // Writes the parameters of this result, comma-separated (no comma before the first one)
+    fprintf(file, "      \"parameters\": {");
+    auto delimiter = "";
+    for (const auto &parameter : result.config) {
+      fprintf(file, "%s\"%s\": %zu", delimiter, parameter.first.c_str(), parameter.second);
+      delimiter = ",";
+    }
+    fprintf(file, "}\n");
+
+    // The footer: every result except the last one is followed by a comma
+    fprintf(file, "    }");
+    if (r < num_results - 1) { fprintf(file, ","); }
+    fprintf(file, "\n");
+  }
+  fprintf(file, "  ]\n");
+  fprintf(file, "}\n");
+  fclose(file);
+}
+
+void print_separator(const size_t parameters_size) { // horizontal rule of the results table
+  printf("x------x-------x");
+  for (auto remaining = parameters_size; remaining > 0; --remaining) { printf("-----"); }
+  printf("-x----------x------------x--------x-------------------x\n");
+}
+
+// =================================================================================================
+
 // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
 // the results. Used for all types of kernel families. Note that this is a header-only function so
 // that it is automatically compiled for the various kernels (given as the 'C' template argument).
@@ -115,121 +182,221 @@ void Tuner(int argc, char* argv[]) {
     if (o == kArgK)        { args.k        = GetArgument(command_line_args, help, kArgK, defaults.default_k); }
     if (o == kArgAlpha)    { args.alpha    = GetArgument(command_line_args, help, kArgAlpha, GetScalar<T>()); }
     if (o == kArgBeta)     { args.beta     = GetArgument(command_line_args, help, kArgBeta, GetScalar<T>()); }
-    if (o == kArgFraction) { args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction); }
     if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, defaults.default_batch_count); }
-    if (o == kArgHeuristicSelection) {args.heuristic_selection = GetArgument(command_line_args, help, kArgHeuristicSelection, defaults.default_heuristic);  }
-    if (o == kArgPsoSwarmSize)   {args.pso_swarm_size      = GetArgument(command_line_args, help, kArgPsoSwarmSize , defaults.default_swarm_size_PSO);  }
-    if (o == kArgPsoInfGlobal)   {args.pso_inf_global      = GetArgument(command_line_args, help, kArgPsoInfGlobal, defaults.default_influence_global_PSO);  }
-    if (o == kArgPsoInfLocal)    {args.pso_inf_local       = GetArgument(command_line_args, help, kArgPsoInfLocal, defaults.default_influence_local_PSO);  }
-    if (o == kArgPsoInfRandom)   {args.pso_inf_random      = GetArgument(command_line_args, help, kArgPsoInfRandom, defaults.default_influence_random_PSO);  }
-    if (o == kArgAnnMaxTemp)     {args.ann_max_temperature = GetArgument(command_line_args, help, kArgAnnMaxTemp, defaults.default_max_temp_ann); }
   }
-  const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs);
-  fprintf(stdout, "%s\n", help.c_str());
+  args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction);
+  args.num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs);
+  const auto max_l2_norm = GetArgument(command_line_args, help, kArgMaxL2Norm, 1.0e-4);
+  printf("%s\n", help.c_str());
   const TunerSettings settings = C::GetTunerSettings(args);
 
   // Tests validity of the given arguments
   C::TestValidArguments(args);
 
+  // Initializes OpenCL
+  const auto platform = Platform(args.platform_id);
+  const auto device = Device(platform, args.device_id);
+  const auto context = Context(device);
+  auto queue = Queue(context, device);
+
   // Tests for validity of the precision and retrieves properties
-  auto isAMD = false;
-  auto isARM = false;
-  auto isGPU = false;
-  auto device_type = std::string{};
-  auto device_vendor = std::string{};
-  auto device_architecture = std::string{};
-  auto device_name = std::string{};
-  { // In a block such that the platform and the device are destroyed before initializing the tuner
-    const auto platform = Platform(args.platform_id);
-    const auto device = Device(platform, args.device_id);
-    if (!PrecisionSupported<T>(device)) {
-      printf("* Unsupported precision, skipping this tuning run\n\n");
-      return;
-    }
-    isAMD = device.IsAMD();
-    isARM = device.IsARM();
-    isGPU = device.IsGPU();
-    device_type = GetDeviceType(device);
-    device_vendor = GetDeviceVendor(device);
-    device_architecture = GetDeviceArchitecture(device);
-    device_name = GetDeviceName(device);
+  if (!PrecisionSupported<T>(device)) {
+    printf("* Unsupported precision, skipping this tuning run\n\n");
+    return;
   }
+  const auto device_type = GetDeviceType(device);
+  const auto device_vendor = GetDeviceVendor(device);
+  const auto device_architecture = GetDeviceArchitecture(device);
+  const auto device_name = GetDeviceName(device);
 
   // Creates input buffers with random data
-  auto x_vec = std::vector<T>(settings.size_x);
-  auto y_vec = std::vector<T>(settings.size_y);
-  auto a_mat = std::vector<T>(settings.size_a);
-  auto b_mat = std::vector<T>(settings.size_b);
-  auto c_mat = std::vector<T>(settings.size_c);
-  auto temp = std::vector<T>(settings.size_temp);
+  const auto buffer_sizes = std::vector<size_t>{
+      settings.size_x, settings.size_y,
+      settings.size_a, settings.size_b, settings.size_c,
+      settings.size_temp
+  };
   std::mt19937 mt(kSeed);
   std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
-  PopulateVector(x_vec, mt, dist);
-  PopulateVector(y_vec, mt, dist);
-  PopulateVector(a_mat, mt, dist);
-  PopulateVector(b_mat, mt, dist);
-  PopulateVector(c_mat, mt, dist);
-  PopulateVector(temp, mt, dist);
-
-  // Initializes the tuner for the chosen device
-  cltune::Tuner tuner(args.platform_id, args.device_id);
-
-  // Select the search method based on the command-line arguments
-  // If the tuner does not support the selected choice, full search will be returned.
-  auto method = settings.heuristic;
-  if      (method == 1) { tuner.UseRandomSearch(1.0/args.fraction); }
-  else if (method == 2) { tuner.UseAnnealing(1.0/args.fraction, args.ann_max_temperature); }
-  else if (method == 3) { tuner.UsePSO(1.0/args.fraction, args.pso_swarm_size, args.pso_inf_global,
-                                       args.pso_inf_local, args.pso_inf_random); }
-  else                  { tuner.UseFullSearch(); }
-
-  // Set extra settings for specific defines. This mimics src/routine.cc.
-  auto defines = std::string{""};
-  if (isAMD && isGPU) {
-    defines += "#define USE_CL_MAD 1\n";
-    defines += "#define USE_STAGGERED_INDICES 1\n";
+  auto source_buffers = std::vector<std::vector<T>>();
+  auto reference_buffers = std::vector<std::vector<T>>();
+  auto result_buffers = std::vector<std::vector<T>>();
+  auto device_buffers = std::vector<Buffer<T>>();
+  for (const auto size : buffer_sizes) {
+    auto host_buffer = std::vector<T>(size);
+    PopulateVector(host_buffer, mt, dist);
+    source_buffers.push_back(host_buffer);
+    auto reference_buffer = std::vector<T>(size);
+    reference_buffers.push_back(reference_buffer);
+    auto result_buffer = std::vector<T>(size);
+    result_buffers.push_back(result_buffer);
+    auto device_buffer = Buffer<T>(context, size);
+    device_buffers.push_back(device_buffer);
   }
-  if (isARM && isGPU) {
-    defines += "#define GLOBAL_MEM_FENCE 1\n";
-  }
-
-  // Loads the kernel sources and defines the kernel to tune
-  auto sources = defines + settings.sources;
-  auto id = tuner.AddKernelFromString(sources, settings.kernel_name, settings.global_size, settings.local_size);
-  tuner.SetReferenceFromString(sources, settings.kernel_name, settings.global_size_ref, settings.local_size_ref);
 
   // Sets the tunable parameters and their possible values
-  for (const auto &parameter: settings.parameters) {
-    tuner.AddParameter(id, parameter.first, parameter.second);
+  auto configurations = SetConfigurations(settings.parameters, C::SetConstraints());
+  printf("* Found %s%zu configuration(s)%s\n",
+         kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str());
+
+  // Select the search method (full search or a random fraction)
+  if (args.fraction != 0.0 && args.fraction != 1.0) {
+    const auto new_size = static_cast<size_t>(configurations.size() / args.fraction);
+    auto rng = std::default_random_engine{};
+    std::shuffle(std::begin(configurations), std::end(configurations), rng);
+    configurations.resize(new_size);
+    printf("* Exploring a random subset of %s%zu configuration(s)%s\n",
+           kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str());
   }
-  C::SetConstraints(tuner, id);
-  C::SetLocalMemorySize(tuner, id, args);
 
-  // Tests for a specific precision
-  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
-  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+  // Prints information about the parameters
+  printf("* Parameters explored: ");
+  for (const auto& parameter : settings.parameters) { printf("%s ", parameter.first.c_str()); }
+  printf("\n");
 
-  // Modifies the thread-sizes (both global and local) based on the parameters
-  for (auto &parameters: settings.mul_local) { tuner.MulLocalSize(id, parameters); }
-  for (auto &parameters: settings.div_local) { tuner.DivLocalSize(id, parameters); }
-  for (auto &parameters: settings.mul_global) { tuner.MulGlobalSize(id, parameters); }
-  for (auto &parameters: settings.div_global) { tuner.DivGlobalSize(id, parameters); }
+  // Prints the header of the table
+  printf("\n");
+  printf("|   ID | total |");
+  for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf("     "); }
+  printf("param | compiles |       time | %6s |            status |\n", settings.performance_unit.c_str());
+  print_separator(settings.parameters.size());
 
-  // Sets the function's arguments
-  C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
+  // First runs a reference example to compare against
+  try {
+    printf("|  ref |     - |");
+    for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf("     "); }
+    printf("    - |");
+
+
+    // Sets the input
+    for (const auto id : settings.inputs) {
+      device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
+    }
+
+    // Compiles the kernel
+    auto compiler_options = std::vector<std::string>();
+    const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name,
+                                           device, context, compiler_options);
+    auto kernel = Kernel(program, settings.kernel_name);
+    C::SetArguments(kernel, args, device_buffers);
+    printf("       %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str());
+
+    // Runs the kernel
+    const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device,
+                                    settings.global_size_ref, settings.local_size_ref);
+    printf("      - |");
+    if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); }
+
+    // Saves the result
+    for (const auto id : settings.outputs) {
+      device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]);
+    }
+    printf("      %sreference OK%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str());
+  }
+  catch (...) {
+    const auto status_code = DispatchExceptionCatchAll(true);
+    printf(" %d |\n", static_cast<int>(status_code));
+    printf("* Exception caught with status %d while running the reference, aborting\n",
+           static_cast<int>(status_code));
+    return;
+  }
+  print_separator(settings.parameters.size());
 
   // Starts the tuning process
-  tuner.SetNumRuns(num_runs);
-  tuner.Tune();
+  auto results = std::vector<TuningResult>();
+  for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) {
+    try {
+
+      const auto configuration = configurations[config_id];
+      printf("| %4zu | %5zu |", config_id + 1, configurations.size());
+      for (const auto& parameter : settings.parameters) {
+        printf("%5zu", configuration.at(parameter.first));
+      }
+      printf(" |");
+
+      // Sets the input
+      for (const auto id : settings.inputs) {
+        device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
+      }
+
+      // Sets the thread configuration
+      const auto global = SetThreadConfiguration(configuration, settings.global_size,
+                                                 settings.mul_global, settings.div_global);
+      const auto local = SetThreadConfiguration(configuration, settings.local_size,
+                                                settings.mul_local, settings.div_local);
+
+      // Sets the parameters for this configuration
+      auto kernel_source = std::string{""};
+      for (const auto &parameter : configuration) {
+        kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n";
+      }
+      kernel_source += settings.sources;
+
+      // Compiles the kernel
+      auto compiler_options = std::vector<std::string>();
+      const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name,
+                                             device, context, compiler_options);
+      auto kernel = Kernel(program, settings.kernel_name);
+      printf("       %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str());
+
+      // Runs the kernel
+      C::SetArguments(kernel, args, device_buffers);
+      const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local);
+
+      // Kernel run was not successful
+      if (time_ms == -1.0) {
+        printf("      - |");
+        printf("   %sinvalid config.%s |", kPrintError.c_str(), kPrintEnd.c_str());
+        printf(" <-- skipping\n");
+        continue;
+      }
+
+      // Compares the results
+      auto l2_error = 0.0;
+      for (const auto id : settings.outputs) {
+        device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]);
+        for (auto index = size_t{0}; index<buffer_sizes[id]; ++index) {
+          const auto diff = SquaredDifference(result_buffers[id][index], reference_buffers[id][index]);
+          l2_error += diff;
+        }
+        l2_error /= static_cast<double>(buffer_sizes[id]);
+        if (std::isnan(l2_error) || l2_error > max_l2_norm) {
+          printf("      - |");
+          printf(" %sL2 error %8.2e%s |", kPrintError.c_str(), l2_error, kPrintEnd.c_str());
+          throw std::runtime_error("L2 error too large");
+        }
+      }
+
+      // All was OK
+      results.push_back(TuningResult{settings.kernel_name, time_ms, configuration});
+      printf(" %6.1lf |", settings.metric_amount / (time_ms * 1.0e6));
+      printf("     %sresults match%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str());
+    }
+    catch (...) {
+      const auto status_code = DispatchExceptionCatchAll(true);
+      if (status_code != StatusCode::kUnknownError) {
+        printf("   %serror code %d%s |",
+               kPrintError.c_str(), static_cast<int>(status_code), kPrintEnd.c_str());
+      }
+      printf(" <-- skipping\n");
+    }
+  }
 
-  // Prints the results to screen
-  auto time_ms = tuner.PrintToScreen();
-  tuner.PrintFormatted();
+  // Completed the tuning process
+  print_separator(settings.parameters.size());
+  printf("\n");
+
+  // Computes the best results
+  auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
+  const auto best_configuration = std::min_element(results.begin(), results.end(), comparison);
+  const auto best_time_ms = best_configuration->score;
 
   // Also prints the performance of the best-case in terms of GB/s or GFLOPS
-  if (time_ms != 0.0) {
-    printf("[ -------> ] %.2lf ms", time_ms);
-    printf(" or %.1lf %s\n", settings.metric_amount/(time_ms*1.0e6), settings.performance_unit.c_str());
+  if (best_time_ms != 0.0) {
+    printf("\n");
+    printf("* Found best result %.2lf ms", best_time_ms);
+    printf(" or %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6),
+           settings.performance_unit.c_str());
+    printf("\n");
   }
 
   // Outputs the results as JSON to disk, including some meta-data
@@ -237,25 +404,25 @@ void Tuner(int argc, char* argv[]) {
   auto metadata = std::vector<std::pair<std::string,std::string>>{
     {"kernel_family", settings.kernel_family},
     {"precision", precision_string},
-    {"clblast_device_type", device_type},
-    {"clblast_device_vendor", device_vendor},
-    {"clblast_device_architecture", device_architecture},
-    {"clblast_device_name", device_name}
   };
   for (auto &o: defaults.options) {
-    if (o == kArgM)     { metadata.push_back({"arg_m", std::to_string(args.m)}); }
-    if (o == kArgN)     { metadata.push_back({"arg_n", std::to_string(args.n)}); }
-    if (o == kArgK)     { metadata.push_back({"arg_k", std::to_string(args.k)}); }
+    if (o == kArgM)     { metadata.push_back({"arg_m", ToString(args.m)}); }
+    if (o == kArgN)     { metadata.push_back({"arg_n", ToString(args.n)}); }
+    if (o == kArgK)     { metadata.push_back({"arg_k", ToString(args.k)}); }
     if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
     if (o == kArgBeta)  { metadata.push_back({"arg_beta", ToString(args.beta)}); }
     if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); }
   }
-  tuner.PrintJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json", metadata);
+  PrintTimingsToFileAsJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json",
+                           device, platform, metadata, results);
+
+  printf("* Completed tuning process\n");
+  printf("\n");
  
 }
 
 // =================================================================================================
 } // namespace clblast
 
-// CLBLAST_TUNING_H_
+// CLBLAST_TUNING_TUNING_H_
 #endif
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index fb5b9e78..3a5e2cff 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -72,59 +72,6 @@ std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t
   return timings;
 }
 
-// =================================================================================================
-
-using TuningParameter = std::pair<std::string, size_t>;
-using TuningParameters = std::vector<TuningParameter>;
-struct TuningResult { std::string name; double score; TuningParameters parameters; };
-
-void PrintTimingsToFileAsJSON(const std::string &filename,
-                              const Device& device, const Platform& platform,
-                              const std::vector<std::pair<std::string,std::string>> &metadata,
-                              const std::vector<TuningResult>& tuning_results) {
-  printf("[  STATUS  ] Writing results to '%s'\n", filename.c_str());
-  auto file = fopen(filename.c_str(), "w");
-  fprintf(file, "{\n");
-  for (auto &datum: metadata) {
-    fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
-  }
-  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
-  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
-  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
-  fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
-  fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
-  fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
-  fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
-  fprintf(file, "  \"results\": [\n");
-
-  // Loops over all results
-  auto num_results = tuning_results.size();
-  for (auto r = size_t{0}; r < num_results; ++r) {
-    auto result = tuning_results[r];
-    fprintf(file, "    {\n");
-    fprintf(file, "      \"kernel\": \"%s\",\n", result.name.c_str());
-    fprintf(file, "      \"time\": %.3lf,\n", result.score);
-
-    // Loops over all the parameters for this result
-    fprintf(file, "      \"parameters\": {");
-    auto num_configs = result.parameters.size();
-    for (auto p=size_t{0}; p<num_configs; ++p) {
-      auto config = result.parameters[p];
-      fprintf(file, "\"%s\": %zu", config.first.c_str(), config.second);
-      if (p < num_configs-1) { fprintf(file, ","); }
-    }
-    fprintf(file, "}\n");
-
-    // The footer
-    fprintf(file, "    }");
-    if (r < num_results - 1) { fprintf(file, ","); }
-    fprintf(file, "\n");
-  }
-  fprintf(file, "  ]\n");
-  fprintf(file, "}\n");
-  fclose(file);
-}
-
 // =================================================================================================
 } // namespace clblast
 
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 3f90906d..e26721b3 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -98,6 +98,7 @@ constexpr auto kArgDilationW = "dilationw";
 // The tuner-specific arguments in string form
 constexpr auto kArgFraction = "fraction";
 constexpr auto kArgHeuristicSelection = "heuristic";
+constexpr auto kArgMaxL2Norm = "max_l2_norm";
 // PSO tuner-specific arguments in string form
 constexpr auto kArgPsoSwarmSize = "pso_swarm_size";
 constexpr auto kArgPsoInfGlobal = "pso_inf_global";
-- 
cgit v1.2.3


From 2b8ad70b63eb4fab6a355941c5add5cb8370ba0a Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Thu, 16 Nov 2017 21:18:29 +0100
Subject: Added printing of the best parameters for the new tuner

---
 src/tuning/tuning.hpp | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index b6edd1f7..83f08ea9 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -109,7 +109,9 @@ void PrintTimingsToFileAsJSON(const std::string &filename,
                               const Device& device, const Platform& platform,
                               const std::vector<std::pair<std::string,std::string>> &metadata,
                               const std::vector<TuningResult>& tuning_results) {
-  printf("* Writing results to '%s'\n", filename.c_str());
+  auto num_results = tuning_results.size();
+  printf("* Writing a total of %zu results to '%s'\n", num_results, filename.c_str());
+
   auto file = fopen(filename.c_str(), "w");
   fprintf(file, "{\n");
   for (auto &datum: metadata) {
@@ -125,7 +127,6 @@ void PrintTimingsToFileAsJSON(const std::string &filename,
   fprintf(file, "  \"results\": [\n");
 
   // Loops over all results
-  auto num_results = tuning_results.size();
   for (auto r = size_t{0}; r < num_results; ++r) {
     auto result = tuning_results[r];
     fprintf(file, "    {\n");
@@ -384,26 +385,37 @@ void Tuner(int argc, char* argv[]) {
   // Completed the tuning process
   print_separator(settings.parameters.size());
   printf("\n");
+  if (results.size() == 0) { return; }
 
   // Computes the best results
   auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
   const auto best_configuration = std::min_element(results.begin(), results.end(), comparison);
   const auto best_time_ms = best_configuration->score;
+  if (best_time_ms == 0.0) { return; }
 
   // Also prints the performance of the best-case in terms of GB/s or GFLOPS
-  if (best_time_ms != 0.0) {
-    printf("\n");
-    printf("* Found best result %.2lf ms", best_time_ms);
-    printf(" or %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6),
-           settings.performance_unit.c_str());
-    printf("\n");
+  printf("\n");
+  printf("* Found best result %.2lf ms", best_time_ms);
+  printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6),
+         settings.performance_unit.c_str());
+  printf("* Best parameters: ");
+  auto best_string = std::string{""};
+  auto i = size_t{0};
+  for (const auto config : best_configuration->config) {
+    best_string += "" + config.first + "=" + ToString(config.second);
+    if (i < best_configuration->config.size() - 1) { best_string += " "; }
+    ++i;
   }
+  printf("%s\n\n", best_string.c_str());
 
   // Outputs the results as JSON to disk, including some meta-data
   auto precision_string = std::to_string(static_cast<size_t>(args.precision));
   auto metadata = std::vector<std::pair<std::string,std::string>>{
     {"kernel_family", settings.kernel_family},
     {"precision", precision_string},
+    {"best_kernel", best_configuration->name},
+    {"best_time", ToString(best_configuration->score)},
+    {"best_parameters", best_string}
   };
   for (auto &o: defaults.options) {
     if (o == kArgM)     { metadata.push_back({"arg_m", ToString(args.m)}); }
-- 
cgit v1.2.3


From d9cf206979bf2938b6790300756bab5c9d7987b6 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Thu, 16 Nov 2017 21:28:36 +0100
Subject: Removed dependency on CLTune

---
 CMakeLists.txt                 | 20 ++++---------
 README.md                      |  4 +--
 cmake/Modules/FindCLTune.cmake | 68 ------------------------------------------
 3 files changed, 6 insertions(+), 86 deletions(-)
 delete mode 100644 cmake/Modules/FindCLTune.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0d30a38c..7ba512eb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -156,15 +156,6 @@ elseif(CUDA)
   link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
 endif()
 
-# Locates the CLTune library in case the tuners need to be compiled. "FindCLTune.cmake" is included.
-if(TUNERS)
-  find_package(CLTune)
-  if(NOT CLTUNE_FOUND)
-    message(STATUS "Could NOT find CLTune, disabling the compilation of the tuners")
-    set(TUNERS OFF)
-  endif()
-endif()
-
 # Don't search for system libraries when cross-compiling
 if(${CMAKE_SYSTEM_NAME} STREQUAL Android)
   if(TESTS)
@@ -374,13 +365,14 @@ endif()
 
 # ==================================================================================================
 
-# This section contains all the code related to the tuners. These tuners require the presence of
-# the CLTune library (not included as part of the source).
+# This section contains all the code related to the tuners
+# TODO: Remove dependency on CLBlast
 if(TUNERS)
 
   set(TUNERS_COMMON
       src/tuning/configurations.cpp
       src/tuning/configurations.hpp
+      src/tuning/tuning.cpp
       src/tuning/tuning.hpp)
 
   # Visual Studio requires the sources of non-exported objects/libraries
@@ -391,14 +383,12 @@ if(TUNERS)
   # Adds tuning executables
   foreach(KERNEL ${KERNELS})
     add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
-    target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
-    target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
+    target_link_libraries(clblast_tuner_${KERNEL} clblast ${API_LIBRARIES})
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
   foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
     add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
-    target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
-    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC ${CLTUNE_INCLUDE_DIRS})
+    target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${API_LIBRARIES})
     install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
   endforeach()
 
diff --git a/README.md b/README.md
index 5f4b3d15..6c27af51 100644
--- a/README.md
+++ b/README.md
@@ -180,8 +180,6 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
 
     cmake -DTUNERS=ON ..
 
-Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.6.0 or higher).
-
 Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables.
 
 The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python (2.7 or 3.x) script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
@@ -416,7 +414,7 @@ More information
 Further information on CLBlast is available through the following links:
 
 * A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf).
-* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017). For CLTune, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
+* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017). For CLTune, the inspiration for the included auto-tuner, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
 
 
 Support us
diff --git a/cmake/Modules/FindCLTune.cmake b/cmake/Modules/FindCLTune.cmake
deleted file mode 100644
index 3a37576a..00000000
--- a/cmake/Modules/FindCLTune.cmake
+++ /dev/null
@@ -1,68 +0,0 @@
-
-# ==================================================================================================
-# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-# width of 100 characters per line.
-#
-# Author(s):
-#   Cedric Nugteren <www.cedricnugteren.nl>
-#
-# ==================================================================================================
-#
-# Defines the following variables:
-#   CLTUNE_FOUND          Boolean holding whether or not the CLTune library was found
-#   CLTUNE_INCLUDE_DIRS   The CLTune include directory
-#   CLTUNE_LIBRARIES      The CLTune library
-#
-# In case CLTune is not installed in the default directory, set the CLTUNE_ROOT variable to point to
-# the root of CLTune, such that 'cltune.h' can be found in $CLTUNE_ROOT/include. This can either be
-# done using an environmental variable (e.g. export CLTUNE_ROOT=/path/to/cltune) or using a CMake
-# variable (e.g. cmake -DCLTUNE_ROOT=/path/to/cltune ..).
-#
-# ==================================================================================================
-
-# Sets the possible install locations
-set(CLTUNE_HINTS
-  ${CLTUNE_ROOT}
-  $ENV{CLTUNE_ROOT}
-)
-set(CLTUNE_PATHS
-  /usr
-  /usr/local
-)
-
-# Finds the include directories
-find_path(CLTUNE_INCLUDE_DIRS
-  NAMES cltune.h
-  HINTS ${CLTUNE_HINTS}
-  PATH_SUFFIXES include inc include/x86_64 include/x64
-  PATHS ${CLTUNE_PATHS}
-  DOC "CLTune include header cltune.h"
-)
-mark_as_advanced(CLTUNE_INCLUDE_DIRS)
-
-# Finds the library
-find_library(CLTUNE_LIBRARIES
-  NAMES cltune
-  HINTS ${CLTUNE_HINTS}
-  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32
-  PATHS ${CLTUNE_PATHS}
-  DOC "CLTune library"
-)
-mark_as_advanced(CLTUNE_LIBRARIES)
-
-# ==================================================================================================
-
-# Notification messages
-if(NOT CLTUNE_INCLUDE_DIRS)
-    message(STATUS "Could NOT find 'cltune.h', install CLTune or set CLTUNE_ROOT")
-endif()
-if(NOT CLTUNE_LIBRARIES)
-    message(STATUS "Could NOT find CLTune library, install it or set CLTUNE_ROOT")
-endif()
-
-# Determines whether or not CLTune was found
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(CLTune DEFAULT_MSG CLTUNE_INCLUDE_DIRS CLTUNE_LIBRARIES)
-
-# ==================================================================================================
-- 
cgit v1.2.3


From f94d498a3773f838bcffb90fd56993a1583ad8ae Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Fri, 17 Nov 2017 20:57:46 +0100
Subject: Moved compilation function to separate file; removed dependency of
 tuners of the CLBlast library

---
 CMakeLists.txt            | 37 ++++++++++++------
 src/routines/common.cpp   | 78 -------------------------------------
 src/routines/common.hpp   |  7 +---
 src/tuning/tuning.hpp     |  1 +
 src/utilities/compile.cpp | 99 +++++++++++++++++++++++++++++++++++++++++++++++
 src/utilities/compile.hpp | 36 +++++++++++++++++
 src/utilities/timing.hpp  |  1 -
 7 files changed, 163 insertions(+), 96 deletions(-)
 create mode 100644 src/utilities/compile.cpp
 create mode 100644 src/utilities/compile.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ba512eb..f051e441 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -224,6 +224,7 @@ endif()
 set(SOURCES
   src/database/database.cpp
   src/routines/common.cpp
+  src/utilities/compile.cpp
   src/utilities/clblast_exceptions.cpp
   src/utilities/timing.cpp
   src/utilities/utilities.cpp
@@ -244,6 +245,7 @@ set(HEADERS  # such that they can be discovered by IDEs such as CLion and Visual
   src/routines/common.hpp
   src/routines/routines.hpp
   src/utilities/buffer_test.hpp
+  src/utilities/compile.hpp
   src/utilities/clblast_exceptions.hpp
   src/utilities/device_mapping.hpp
   src/utilities/msvc.hpp
@@ -366,29 +368,42 @@ endif()
 # ==================================================================================================
 
 # This section contains all the code related to the tuners
-# TODO: Remove dependency on CLBlast
 if(TUNERS)
 
   set(TUNERS_COMMON
+      src/utilities/compile.cpp
+      src/utilities/clblast_exceptions.cpp
+      src/utilities/timing.cpp
+      src/utilities/utilities.cpp
       src/tuning/configurations.cpp
+      src/tuning/tuning.cpp)
+  set(TUNERS_HEADERS  # such that they can be discovered by IDEs such as CLion and Visual Studio
+      src/utilities/compile.hpp
+      src/utilities/clblast_exceptions.hpp
+      src/utilities/timing.hpp
+      src/utilities/utilities.hpp
       src/tuning/configurations.hpp
-      src/tuning/tuning.cpp
       src/tuning/tuning.hpp)
 
-  # Visual Studio requires the sources of non-exported objects/libraries
-  if(MSVC)
-    set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
-  endif()
-
   # Adds tuning executables
   foreach(KERNEL ${KERNELS})
-    add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
-    target_link_libraries(clblast_tuner_${KERNEL} clblast ${API_LIBRARIES})
+    add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} ${TUNERS_HEADERS}
+                   src/tuning/kernels/${KERNEL}.cpp)
+    target_include_directories(clblast_tuner_${KERNEL} PUBLIC
+                           $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
+                           $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
+                           ${API_INCLUDE_DIRS})
+    target_link_libraries(clblast_tuner_${KERNEL} ${API_LIBRARIES})
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
   foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
-    add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
-    target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${API_LIBRARIES})
+    add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} ${TUNERS_HEADERS}
+                   src/tuning/routines/${ROUTINE_TUNER}.cpp)
+    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC
+                           $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
+                           $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
+                           ${API_INCLUDE_DIRS})
+    target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} ${API_LIBRARIES})
     install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
   endforeach()
 
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index c415d9fd..5b178e53 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -19,84 +19,6 @@
 namespace clblast {
 // =================================================================================================
 
-// Compiles a program from source code
-Program CompileFromSource(const std::string &source_string, const Precision precision,
-                          const std::string &routine_name,
-                          const Device& device, const Context& context,
-                          std::vector<std::string>& options) {
-  auto header_string = std::string{""};
-
-  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
-
-  // Adds the name of the routine as a define
-  header_string += "#define ROUTINE_" + routine_name + "\n";
-
-  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
-  // which it is known to work with all OpenCL platforms.
-  if (device.IsNVIDIA() || device.IsARM()) {
-    header_string += "#define USE_INLINE_KEYWORD 1\n";
-  }
-
-  // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
-  // performance, but might result in a reduced accuracy.
-  if (device.IsAMD() && device.IsGPU()) {
-    header_string += "#define USE_CL_MAD 1\n";
-  }
-
-  // For specific devices, use staggered/shuffled workgroup indices.
-  if (device.IsAMD() && device.IsGPU()) {
-    header_string += "#define USE_STAGGERED_INDICES 1\n";
-  }
-
-  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
-  // performance through better cache behaviour
-  if (device.IsARM() && device.IsGPU()) {
-    header_string += "#define GLOBAL_MEM_FENCE 1\n";
-  }
-
-  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
-  #ifdef CUDA_API
-    source_string +=
-      #include "kernels/opencl_to_cuda.h"
-    ;
-  #endif
-
-  // Loads the common header (typedefs and defines and such)
-  header_string +=
-    #include "kernels/common.opencl"
-  ;
-
-  // Prints details of the routine to compile in case of debugging in verbose mode
-  #ifdef VERBOSE
-    printf("[DEBUG] Compiling routine '%s-%s'\n",
-           routine_name.c_str(), ToString(precision).c_str());
-    const auto start_time = std::chrono::steady_clock::now();
-  #endif
-
-  // Compiles the kernel
-  auto program = Program(context, header_string + source_string);
-  try {
-    program.Build(device, options);
-  } catch (const CLCudaAPIBuildError &e) {
-    if (program.StatusIsCompilationWarningOrError(e.status())) {
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
-              program.GetBuildInfo(device).c_str());
-    }
-    throw;
-  }
-
-  // Prints the elapsed compilation time in case of debugging in verbose mode
-  #ifdef VERBOSE
-    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
-    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
-    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
-  #endif
-
-  return program;
-}
-
-// =================================================================================================
-
 // Enqueues a kernel, waits for completion, and checks for errors
 void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                std::vector<size_t> global, const std::vector<size_t> &local,
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 8a93d74a..06d001d9 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -20,17 +20,12 @@
 #include <vector>
 
 #include "utilities/utilities.hpp"
+#include "utilities/compile.hpp"
 #include "database/database.hpp"
 
 namespace clblast {
 // =================================================================================================
 
-// Compiles a program from source code
-Program CompileFromSource(const std::string &source_string, const Precision precision,
-                          const std::string &routine_name,
-                          const Device& device, const Context& context,
-                          std::vector<std::string>& options);
-
 // Enqueues a kernel, waits for completion, and checks for errors
 void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                std::vector<size_t> global, const std::vector<size_t> &local,
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 83f08ea9..c8a12b5b 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -23,6 +23,7 @@
 #include <iostream>
 
 #include "utilities/utilities.hpp"
+#include "utilities/compile.hpp"
 #include "utilities/timing.hpp"
 #include "tuning/configurations.hpp"
 
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
new file mode 100644
index 00000000..3c02d316
--- /dev/null
+++ b/src/utilities/compile.cpp
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the kernel compilation functions (see the header for more information).
+//
+// =================================================================================================
+
+#include <vector>
+#include <chrono>
+
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source code, prefixed by generated defines and the common header
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options) {
+  auto header_string = std::string{""};  // everything that is prepended to 'source_string'
+  // Adds the precision as a define
+  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
+
+  // Adds the name of the routine as a define
+  header_string += "#define ROUTINE_" + routine_name + "\n";
+
+  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+  // which it is known to work with all OpenCL platforms.
+  if (device.IsNVIDIA() || device.IsARM()) {
+    header_string += "#define USE_INLINE_KEYWORD 1\n";
+  }
+
+  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
+  // performance, but might result in a reduced accuracy.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_CL_MAD 1\n";
+  }
+
+  // For specific devices, use staggered/shuffled workgroup indices.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device.IsARM() && device.IsGPU()) {
+    header_string += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Optionally adds a translation header from OpenCL to CUDA ('source_string' is read-only)
+  #ifdef CUDA_API
+    header_string +=
+      #include "kernels/opencl_to_cuda.h"
+    ;
+  #endif
+
+  // Loads the common header (typedefs and defines and such)
+  header_string +=
+    #include "kernels/common.opencl"
+  ;
+
+  // Prints details of the routine to compile in case of debugging in verbose mode
+  #ifdef VERBOSE
+    printf("[DEBUG] Compiling routine '%s-%s'\n",
+           routine_name.c_str(), ToString(precision).c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
+  // Compiles the kernel
+  auto program = Program(context, header_string + source_string);
+  try {
+    program.Build(device, options);
+  } catch (const CLCudaAPIBuildError &e) {
+    if (program.StatusIsCompilationWarningOrError(e.status())) {
+      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
+              program.GetBuildInfo(device).c_str());
+    }
+    throw;
+  }
+
+  // Prints the elapsed compilation time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+  #endif
+
+  return program;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp
new file mode 100644
index 00000000..bd4686eb
--- /dev/null
+++ b/src/utilities/compile.hpp
@@ -0,0 +1,36 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the CLBlast way to compile a kernel from source, used for the library and for
+// the auto-tuners.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_UTILITIES_COMPILE_H_
+#define CLBLAST_UTILITIES_COMPILE_H_
+
+#include <string>
+#include <vector>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source code, prefixed by generated defines and the common header
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_UTILITIES_COMPILE_H_
+#endif
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index 3a5e2cff..e8040058 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -21,7 +21,6 @@
 #include <chrono>
 
 #include "utilities/utilities.hpp"
-#include "routines/common.hpp"
 
 namespace clblast {
 // =================================================================================================
-- 
cgit v1.2.3


From 8a5a5e031e3552ef36d7b3a16ecf5cef6cdb4614 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Fri, 17 Nov 2017 20:58:36 +0100
Subject: Moved some tuning functions from .hpp to .cpp

---
 src/tuning/tuning.cpp | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/tuning/tuning.hpp | 52 ++-----------------------------
 2 files changed, 86 insertions(+), 50 deletions(-)

diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index e69de29b..bd8337b4 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -0,0 +1,84 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the generic CLBlast auto-tuner (inspired by CLTune). This is only used for
+// the optional and stand-alone tuner binaries and is not part of the core of CLBlast.
+//
+// =================================================================================================
+
+#include <vector>
+#include <string>
+#include <random>
+#include <utility>
+#include <algorithm>
+#include <iostream>
+
+#include "utilities/utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+void PrintTimingsToFileAsJSON(const std::string &filename,
+                              const Device& device, const Platform& platform,
+                              const std::vector<std::pair<std::string,std::string>> &metadata,
+                              const std::vector<TuningResult>& tuning_results) {
+  // Writes the metadata, the device properties, and all tuning results to 'filename' as JSON
+  const auto num_results = tuning_results.size();
+  printf("* Writing a total of %zu results to '%s'\n", num_results, filename.c_str());
+
+  auto file = fopen(filename.c_str(), "w");  // null when the file cannot be created
+  if (file == nullptr) { printf("* Could not open '%s' for writing\n", filename.c_str()); return; }
+  fprintf(file, "{\n");
+  for (auto &datum: metadata) {
+    fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
+  }
+  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
+  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
+  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
+  fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
+  fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
+  fprintf(file, "  \"results\": [\n");
+
+  // Loops over all results
+  for (auto r = size_t{0}; r < num_results; ++r) {
+    const auto &result = tuning_results[r];
+    fprintf(file, "    {\n");
+    fprintf(file, "      \"kernel\": \"%s\",\n", result.name.c_str());
+    fprintf(file, "      \"time\": %.3lf,\n", result.score);
+
+    // Loops over all the parameters for this result
+    fprintf(file, "      \"parameters\": {");
+    auto separator = "";  // becomes "," after the first entry, so no trailing comma is written
+    for (const auto &parameter : result.config) {
+      fprintf(file, "%s\"%s\": %zu", separator, parameter.first.c_str(), parameter.second);
+      separator = ",";
+    }
+    fprintf(file, "}\n");
+
+    // The footer
+    fprintf(file, "    }");
+    if (r < num_results - 1) { fprintf(file, ","); }
+    fprintf(file, "\n");
+  }
+  fprintf(file, "  ]\n");
+  fprintf(file, "}\n");
+  fclose(file);
+}
+
+void print_separator(const size_t parameters_size) {
+  const auto dashes = std::string(5 * parameters_size, '-');  // one "-----" per parameter
+  printf("x------x-------x%s", dashes.c_str());
+  printf("-x----------x------------x--------x-------------------x\n");
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index c8a12b5b..41f394c1 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -109,57 +109,9 @@ struct TuningResult { std::string name; double score; Configuration config; };
 void PrintTimingsToFileAsJSON(const std::string &filename,
                               const Device& device, const Platform& platform,
                               const std::vector<std::pair<std::string,std::string>> &metadata,
-                              const std::vector<TuningResult>& tuning_results) {
-  auto num_results = tuning_results.size();
-  printf("* Writing a total of %zu results to '%s'\n", num_results, filename.c_str());
-
-  auto file = fopen(filename.c_str(), "w");
-  fprintf(file, "{\n");
-  for (auto &datum: metadata) {
-    fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
-  }
-  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
-  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
-  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
-  fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
-  fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
-  fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
-  fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
-  fprintf(file, "  \"results\": [\n");
-
-  // Loops over all results
-  for (auto r = size_t{0}; r < num_results; ++r) {
-    auto result = tuning_results[r];
-    fprintf(file, "    {\n");
-    fprintf(file, "      \"kernel\": \"%s\",\n", result.name.c_str());
-    fprintf(file, "      \"time\": %.3lf,\n", result.score);
-
-    // Loops over all the parameters for this result
-    fprintf(file, "      \"parameters\": {");
-    auto num_configs = result.config.size();
-    auto p = size_t{0};
-    for (const auto parameter : result.config) {
-      fprintf(file, "\"%s\": %zu", parameter.first.c_str(), parameter.second);
-      if (p < num_configs -1 ) { fprintf(file, ","); }
-      ++p;
-    }
-    fprintf(file, "}\n");
-
-    // The footer
-    fprintf(file, "    }");
-    if (r < num_results - 1) { fprintf(file, ","); }
-    fprintf(file, "\n");
-  }
-  fprintf(file, "  ]\n");
-  fprintf(file, "}\n");
-  fclose(file);
-}
+                              const std::vector<TuningResult>& tuning_results);
 
-void print_separator(const size_t parameters_size) {
-  printf("x------x-------x");
-  for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); }
-  printf("-x----------x------------x--------x-------------------x\n");
-}
+void print_separator(const size_t parameters_size);
 
 // =================================================================================================
 
-- 
cgit v1.2.3


From 7a54494577ccee401b63cfa82688661fc66f59a4 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 19 Nov 2017 12:58:41 +0100
Subject: Modified the kernel tuners to use the newly integrated auto-tuner

---
 src/tuning/kernels/copy_fast.cpp      | 26 +++++-----
 src/tuning/kernels/copy_pad.cpp       | 42 ++++++++-------
 src/tuning/kernels/transpose_fast.cpp | 31 +++++------
 src/tuning/kernels/transpose_pad.cpp  | 47 ++++++++---------
 src/tuning/kernels/xaxpy.cpp          | 26 +++++-----
 src/tuning/kernels/xdot.cpp           | 46 ++++++++---------
 src/tuning/kernels/xgemm.cpp          | 81 +++++++++++------------------
 src/tuning/kernels/xgemm_direct.cpp   | 96 ++++++++++++++---------------------
 src/tuning/kernels/xgemv.cpp          | 73 +++++++++++---------------
 src/tuning/kernels/xger.cpp           | 44 ++++++++--------
 10 files changed, 218 insertions(+), 294 deletions(-)

diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index 068c5f1b..462107d3 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels.
+// This file uses the auto-tuner to tune the copy OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TuneCopy {
     settings.kernel_family = "copy";
     settings.kernel_name = "CopyMatrixFast";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/copy_fast.opencl"
     ;
@@ -51,6 +50,10 @@ class TuneCopy {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,20 +81,15 @@ class TuneCopy {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(2, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(3, GetRealArg(args.alpha));
   }
 };
 
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 7102d05d..24557517 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels.
+// This file uses the auto-tuner to tune the pad OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TunePad {
     settings.kernel_family = "pad";
     settings.kernel_name = "CopyPadMatrix";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/copy_pad.opencl"
     ;
@@ -51,6 +50,10 @@ class TunePad {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Input and output buffer IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,28 +81,23 @@ class TunePad {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(0);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.m));
+    kernel.SetArgument(3, 0);
+    kernel.SetArgument(4, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(5, static_cast<int>(args.m));
+    kernel.SetArgument(6, static_cast<int>(args.n));
+    kernel.SetArgument(7, static_cast<int>(args.m));
+    kernel.SetArgument(8, 0);
+    kernel.SetArgument(9, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(10, GetRealArg(args.alpha));
+    kernel.SetArgument(11, 0);
   }
 };
 
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 56726903..1e0d3c7b 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels.
+// This file uses the auto-tuner to tune the transpose OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TuneTranspose {
     settings.kernel_family = "transpose";
     settings.kernel_name = "TransposeMatrixFast";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/transpose_fast.opencl"
     ;
@@ -51,6 +50,10 @@ class TuneTranspose {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Input and output buffer IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,25 +81,15 @@ class TuneTranspose {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"});
-  }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(2, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(3, GetRealArg(args.alpha));
   }
 };
 
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index dc46e903..087f8e67 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels.
+// This file uses the auto-tuner to tune the pad-transpose OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TunePadTranspose {
     settings.kernel_family = "padtranspose";
     settings.kernel_name = "TransposePadMatrix";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/transpose_pad.opencl"
     ;
@@ -51,6 +50,10 @@ class TunePadTranspose {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Input and output buffer IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -77,33 +80,23 @@ class TunePadTranspose {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"});
-  }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(0);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.m));
+    kernel.SetArgument(3, 0);
+    kernel.SetArgument(4, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(5, static_cast<int>(args.n));
+    kernel.SetArgument(6, static_cast<int>(args.m));
+    kernel.SetArgument(7, static_cast<int>(args.n));
+    kernel.SetArgument(8, 0);
+    kernel.SetArgument(9, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(10, GetRealArg(args.alpha));
+    kernel.SetArgument(11, 0);
   }
 };
 
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index e201949a..d843ea78 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xaxpy OpenCL kernels.
+// This file uses the auto-tuner to tune the xaxpy OpenCL kernels.
 //
 // =================================================================================================
 
@@ -41,7 +41,6 @@ class TuneXaxpy {
     settings.kernel_family = "xaxpy";
     settings.kernel_name = "XaxpyFastest";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level1/level1.opencl"
 #include "../src/kernels/level1/xaxpy.opencl"
     ;
@@ -50,6 +49,10 @@ class TuneXaxpy {
     settings.size_x = args.n;
     settings.size_y = args.n;
 
+    // Input and output buffer IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1};
+    settings.outputs = {1};
+
     // Sets the base thread configuration
     settings.global_size = {args.n};
     settings.global_size_ref = settings.global_size;
@@ -80,20 +83,15 @@ class TuneXaxpy {
       throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW");
     }
   }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentInput(x_vec);
-    tuner.AddArgumentOutput(y_vec);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.n));
+    kernel.SetArgument(1, GetRealArg(args.alpha));
+    kernel.SetArgument(2, buffers[0]()); // 0 == X vector
+    kernel.SetArgument(3, buffers[1]()); // 1 == Y vector
   }
 };
 
diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp
index fb532680..12350657 100644
--- a/src/tuning/kernels/xdot.cpp
+++ b/src/tuning/kernels/xdot.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xdot OpenCL kernels. Note that the results are
+// This file uses the auto-tuner to tune the xdot OpenCL kernels. Note that the results are
 // not verified, since the result is not final and depends on the WGS2 parameter.
 //
 // =================================================================================================
@@ -42,7 +42,6 @@ class TuneXdot {
     settings.kernel_family = "xdot_"+std::to_string(V);
     settings.kernel_name = (V==1) ? "Xdot" : "XdotEpilogue";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level1/xdot.opencl"
     ;
 
@@ -51,6 +50,10 @@ class TuneXdot {
     settings.size_y = args.n;
     settings.size_temp = args.n; // Worst case
 
+    // Input and output buffer IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1, 5};
+    settings.outputs = {}; // no output checking
+
     // Sets the base thread configuration
     settings.global_size = (V==1) ? std::vector<size_t>{2*64} : std::vector<size_t>{1};
     settings.global_size_ref = (V==1) ? std::vector<size_t>{2*64*64} : std::vector<size_t>{64};
@@ -58,8 +61,8 @@ class TuneXdot {
     settings.local_size_ref = {64};
 
     // Transforms the thread configuration based on the parameters
-    settings.mul_local = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}};
-    settings.mul_global = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}};
+    settings.mul_local = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}};
+    settings.mul_global = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}};
 
     // Sets the tuning parameters and their possible values
     settings.parameters = {
@@ -75,31 +78,26 @@ class TuneXdot {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &temp) {
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
     if (V == 1) {
-      tuner.AddArgumentScalar(static_cast<int>(args.n));
-      tuner.AddArgumentInput(x_vec);
-      tuner.AddArgumentScalar(0);
-      tuner.AddArgumentScalar(1);
-      tuner.AddArgumentInput(y_vec);
-      tuner.AddArgumentScalar(0);
-      tuner.AddArgumentScalar(1);
-      tuner.AddArgumentInput(temp); // No output checking for the result - size varies
-      tuner.AddArgumentScalar(static_cast<int>(false));
+      kernel.SetArgument(0, static_cast<int>(args.n));
+      kernel.SetArgument(1, buffers[0]()); // 0 == X vector
+      kernel.SetArgument(2, 0);
+      kernel.SetArgument(3, 1);
+      kernel.SetArgument(4, buffers[1]()); // 1 == Y vector
+      kernel.SetArgument(5, 0);
+      kernel.SetArgument(6, 1);
+      kernel.SetArgument(7, buffers[5]()); // 5 == temp; no output checking - size varies
+      kernel.SetArgument(8, static_cast<int>(false));
     }
     else {
-      tuner.AddArgumentInput(temp);
-      tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere
-      tuner.AddArgumentScalar(0);
+      kernel.SetArgument(0, buffers[5]()); // 5 == temp
+      kernel.SetArgument(1, buffers[0]()); // 0 == X vector, stores the result; no output checking
+      kernel.SetArgument(2, 0);
     }
   }
 };
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 6dcdf68b..16e32988 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations:
+// This file uses the auto-tuner to tune the xgemm OpenCL kernels. There are two variations:
 // - V==1: This tests some limited set of tuning parameters exhaustively.
 // - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
 //
@@ -38,7 +38,6 @@ class TuneXgemm {
     settings.default_k = 1024;
     settings.default_fraction = (V==1) ? 1.0 : 512.0; // test all or sample randomly
     settings.default_num_runs = 2;
-    settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch);
     return settings;
   }
 
@@ -50,7 +49,6 @@ class TuneXgemm {
     settings.kernel_family = (V==1) ? "xgemm_1" : "xgemm_2";
     settings.kernel_name = "Xgemm";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/xgemm_part1.opencl"
 #include "../src/kernels/level3/xgemm_part2.opencl"
 #include "../src/kernels/level3/xgemm_part3.opencl"
@@ -61,6 +59,10 @@ class TuneXgemm {
     settings.size_b = args.n * args.k;
     settings.size_c = args.m * args.n;
 
+    // Input and output buffer IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3, 4};
+    settings.outputs = {4};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -114,74 +116,51 @@ class TuneXgemm {
     settings.metric_amount = 2 * args.m * args.n * args.k;
     settings.performance_unit = "GFLOPS";
 
-    // Returns which search heuristic to use
-    if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); }
-    else {
-      // Use full-search to explore all parameter combinations or another strategy to search only a
-      // part of the parameter values. The fraction is set as a command-line argument.
-      if (args.fraction == 1.0 || args.fraction == 0.0) {
-        settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
-      } else {
-        settings.heuristic = args.heuristic_selection;
-      }
-    }
-
     return settings;
   }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints
-  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+  static std::vector<Constraint> SetConstraints() {
+    auto constraints = std::vector<Constraint>();
     auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
     auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
     auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
     // Requirement for unrolling the KWG loop
-    tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"});
+    constraints.push_back({MultipleOfX, {"KWG", "KWI"}});
     // Required for integer MWI and NWI
-    tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"});
+    constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}});
+    constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}});
     // Required for integer MWIA and NWIB
-    tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"});
+    constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}});
+    constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}});
     // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...)
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"});
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"});
+    constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}});
+    constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}});
 
     // Extra constraints for variation 1 to limit the set of options significantly
     if (V==1) {
       auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
-      tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"});
-      tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"});
-      tuner.AddConstraint(id, IsEqual, {"SA", "SB"});
+      constraints.push_back({IsEqual, {"MDIMC", "MDIMA"}});
+      constraints.push_back({IsEqual, {"NDIMC", "NDIMB"}});
+      constraints.push_back({IsEqual, {"SA", "SB"}});
     }
-  }
-
-  // Sets the local memory size
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG",
-                                                    "SB", "KWG", "NWG"});
+    return constraints;
   }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.k));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(GetRealArg(args.beta));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentInput(b_mat);
-    tuner.AddArgumentOutput(c_mat);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(0);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.k));
+    kernel.SetArgument(3, GetRealArg(args.alpha));
+    kernel.SetArgument(4, GetRealArg(args.beta));
+    kernel.SetArgument(5, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(6, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(7, buffers[4]()); // 4 == C matrix
+    kernel.SetArgument(8, 0);
+    kernel.SetArgument(9, 0);
   }
 };
 
diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp
index 619fb37a..60a983b4 100644
--- a/src/tuning/kernels/xgemm_direct.cpp
+++ b/src/tuning/kernels/xgemm_direct.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations:
+// This file uses the auto-tuner to tune the direct xgemm kernels. There are two variations:
 // - V==1: This tests some limited set of tuning parameters exhaustively.
 // - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
 //
@@ -36,9 +36,8 @@ class TuneXgemmDirect {
     settings.default_m = 256;
     settings.default_n = 256;
     settings.default_k = 256;
-    settings.default_fraction = (V==1) ? 1.0 : 32.0; // test all or sample randomly
+    settings.default_fraction = (V==1) ? 1.0 : 64.0; // test all or sample randomly
     settings.default_num_runs = 4;
-    settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch);
     return settings;
   }
 
@@ -50,7 +49,6 @@ class TuneXgemmDirect {
     settings.kernel_family = (V==1) ? "xgemm_direct_1" : "xgemm_direct_2";
     settings.kernel_name = "XgemmDirectTN";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/xgemm_direct_part1.opencl"
 #include "../src/kernels/level3/xgemm_direct_part2.opencl"
 #include "../src/kernels/level3/xgemm_direct_part3.opencl"
@@ -61,6 +59,10 @@ class TuneXgemmDirect {
     settings.size_b = args.n * args.k;
     settings.size_c = args.m * args.n;
 
+    // Input and output buffer IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3, 4};
+    settings.outputs = {4};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -89,7 +91,7 @@ class TuneXgemmDirect {
     }
     else { // a lot more tuning parameters - has to be sampled randomly, too much to test all
       settings.parameters = {
-        {"WGD", {8, 16, 32, 64, 128}},
+        {"WGD", {8, 16, 32, 64}},
         {"MDIMCD", {8, 16, 32}},
         {"NDIMCD", {8, 16, 32}},
         {"MDIMAD", {8, 16, 32}},
@@ -106,79 +108,57 @@ class TuneXgemmDirect {
     settings.metric_amount = 2 * args.m * args.n * args.k;
     settings.performance_unit = "GFLOPS";
 
-    // Returns which search heuristic to use
-    if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); }
-    else {
-      // Use full-search to explore all parameter combinations or another strategy to search only a
-      // part of the parameter values. The fraction is set as a command-line argument.
-      if (args.fraction == 1.0 || args.fraction == 0.0) {
-        settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
-      } else {
-        settings.heuristic = args.heuristic_selection;
-      }
-    }
-
     return settings;
   }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints
-  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+  static std::vector<Constraint> SetConstraints() {
+    auto constraints = std::vector<Constraint>();
     auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
     auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
     auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
     // Requirement for unrolling the WGD loop
-    tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"});
+    constraints.push_back({MultipleOfX, {"WGD", "KWID"}});
     // Required for integer MWID and NWID
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}});
     // Required for integer MWIAD and NWIBD
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}});
     // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...)
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"});
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"});
+    constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}});
+    constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}});
 
     // Extra constraints for variation 1 to limit the set of options significantly
     if (V==1) {
       auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
-      tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"});
-      tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"});
+      constraints.push_back({IsEqual, {"MDIMCD", "MDIMAD"}});
+      constraints.push_back({IsEqual, {"NDIMCD", "NDIMBD"}});
     }
-  }
-
-  // Sets the local memory size
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"});
+    return constraints;
   }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.k));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(GetRealArg(args.beta));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(0); // a_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld
-    tuner.AddArgumentInput(b_mat);
-    tuner.AddArgumentScalar(0); // b_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld
-    tuner.AddArgumentOutput(c_mat);
-    tuner.AddArgumentScalar(0); // c_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld
-    tuner.AddArgumentScalar(1); // c_do_transpose
-    tuner.AddArgumentScalar(0); // a_conjugate
-    tuner.AddArgumentScalar(0); // b_conjugate
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.k));
+    kernel.SetArgument(3, GetRealArg(args.alpha));
+    kernel.SetArgument(4, GetRealArg(args.beta));
+    kernel.SetArgument(5, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(6, 0); // a_offset
+    kernel.SetArgument(7, static_cast<int>(args.k)); // a_ld
+    kernel.SetArgument(8, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(9, 0); // b_offset
+    kernel.SetArgument(10, static_cast<int>(args.n)); // b_ld
+    kernel.SetArgument(11, buffers[4]()); // 4 == C matrix
+    kernel.SetArgument(12, 0); // c_offset
+    kernel.SetArgument(13, static_cast<int>(args.n)); // c_ld
+    kernel.SetArgument(14, 1); // c_do_transpose
+    kernel.SetArgument(15, 0); // a_conjugate
+    kernel.SetArgument(16, 0); // b_conjugate
   }
 };
 
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index e66b15f1..3eadd32b 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned:
+// This file uses the auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned:
 // 1: The full version of the kernel
 // 2: The fast version for non-transposed matrices
 // 3: The fast version for transposed matrices
@@ -45,7 +45,6 @@ class TuneXgemv {
     settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot");
     settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot");
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level2/xgemv.opencl"
 #include "../src/kernels/level2/xgemv_fast.opencl"
     ;
@@ -55,6 +54,10 @@ class TuneXgemv {
     settings.size_y = args.m;
     settings.size_a = args.m * args.n;
 
+    // Input and output buffer IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1, 2};
+    settings.outputs = {1};
+
     // Sets the base thread configuration
     settings.global_size = {args.m};
     settings.global_size_ref = settings.global_size;
@@ -63,9 +66,7 @@ class TuneXgemv {
 
     // Transforms the thread configuration based on the parameters
     settings.mul_local = {{"WGS"+std::to_string(V)}};
-    settings.div_global = (V==1 || V==2) ?
-                          TunerSettings::TransformVector{{"WPT"+std::to_string(V)}} :
-                          TunerSettings::TransformVector{};
+    settings.div_global = (V==1 || V==2) ? TransformVector{{"WPT"+std::to_string(V)}} : TransformVector{};
 
     // Sets the tuning parameters and their possible values
     if (V==1) {
@@ -98,53 +99,41 @@ class TuneXgemv {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+  static std::vector<Constraint> SetConstraints() {
+    auto constraints = std::vector<Constraint>();
     if (V==2 || V==3) {
       auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
-      tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
+      constraints.push_back({MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}});
     }
     if (V==3) {
       auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; };
-      tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
-    }
-  }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    if (V==1 || V==2) {
-      auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
-      tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
-    }
-    else {
-      auto LocalMemorySize = [args] (std::vector<size_t> v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); };
-      tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
+      constraints.push_back({LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}});
     }
+    return constraints;
   }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &) {
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
     auto a_rotated = (V==3) ? 1 : 0;
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(GetRealArg(args.beta));
-    tuner.AddArgumentScalar(static_cast<int>(a_rotated));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(x_vec);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(1);
-    tuner.AddArgumentOutput(y_vec);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(1);
-    tuner.AddArgumentScalar(0); // Conjugate transpose
-    tuner.AddArgumentScalar(0); // Additional parameter
-    tuner.AddArgumentScalar(0); // Banded 'kl'
-    tuner.AddArgumentScalar(0); // Banded 'ku'
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, GetRealArg(args.alpha));
+    kernel.SetArgument(3, GetRealArg(args.beta));
+    kernel.SetArgument(4, a_rotated);
+    kernel.SetArgument(5, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(6, 0);
+    kernel.SetArgument(7, static_cast<int>(args.m));
+    kernel.SetArgument(8, buffers[0]()); // 0 == X vector
+    kernel.SetArgument(9, 0);
+    kernel.SetArgument(10, 1);
+    kernel.SetArgument(11, buffers[1]()); // 1 == Y vector
+    kernel.SetArgument(12, 0);
+    kernel.SetArgument(13, 1);
+    kernel.SetArgument(14, 0); // Conjugate transpose
+    kernel.SetArgument(15, 0); // Additional parameter
+    kernel.SetArgument(16, 0); // Banded 'kl'
+    kernel.SetArgument(17, 0); // Banded 'ku'
   }
 };
 
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index c2eb1d31..745e553f 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels.
+// This file uses the auto-tuner to tune the xger OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TuneXger {
     settings.kernel_family = "xger";
     settings.kernel_name = "Xger";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level2/level2.opencl"
 #include "../src/kernels/level2/xger.opencl"
     ;
@@ -52,6 +51,10 @@ class TuneXger {
     settings.size_y = args.n;
     settings.size_a = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1, 2};
+    settings.outputs = {2};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,29 +81,24 @@ class TuneXger {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentInput(x_vec);
-    tuner.AddArgumentScalar(0); // x_offset
-    tuner.AddArgumentScalar(1); // x_increment
-    tuner.AddArgumentInput(y_vec);
-    tuner.AddArgumentScalar(0); // y_offset
-    tuner.AddArgumentScalar(1); // y_increment
-    tuner.AddArgumentOutput(a_mat);
-    tuner.AddArgumentScalar(0); // a_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.m)); // a_ld
-    tuner.AddArgumentScalar(0); // a_is_rowmajor
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, GetRealArg(args.alpha));
+    kernel.SetArgument(3, buffers[0]()); // 0 == X vector
+    kernel.SetArgument(4, 0); // x_offset
+    kernel.SetArgument(5, 1); // x_increment
+    kernel.SetArgument(6, buffers[1]()); // 1 == Y vector
+    kernel.SetArgument(7, 0); // y_offset
+    kernel.SetArgument(8, 1); // y_increment
+    kernel.SetArgument(9, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(10, 0); // a_offset
+    kernel.SetArgument(11, static_cast<int>(args.m)); // a_ld
+    kernel.SetArgument(12, 0); // a_is_rowmajor
   }
 };
 
-- 
cgit v1.2.3


From 8d2f7d53aa28af09fb86324504056789fe753b4b Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 19 Nov 2017 12:59:28 +0100
Subject: Added a library with common tuner sources to speed-up compilation

---
 CMakeLists.txt | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f051e441..d7f30906 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -384,26 +384,33 @@ if(TUNERS)
       src/utilities/utilities.hpp
       src/tuning/configurations.hpp
       src/tuning/tuning.hpp)
+  set(TUNERS_COMMON ${TUNERS_COMMON} ${TUNERS_HEADERS})
+
+  # Creates a library with common sources for all tuners
+  if(MSVC)
+    # Visual Studio requires the sources of non-exported objects/libraries
+  else()
+    # Creates the common tuner objects (requires CMake 2.8.8)
+    add_library(tuners_common_library OBJECT ${TUNERS_COMMON})
+
+    # Adds CLBlast's interface include paths because we can't link to CLBlast here
+    target_include_directories(tuners_common_library PRIVATE
+                               $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
+                               ${clblast_SOURCE_DIR} ${API_INCLUDE_DIRS})
+    set(TUNERS_COMMON $<TARGET_OBJECTS:tuners_common_library>)
+  endif()
 
   # Adds tuning executables
   foreach(KERNEL ${KERNELS})
-    add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} ${TUNERS_HEADERS}
-                   src/tuning/kernels/${KERNEL}.cpp)
-    target_include_directories(clblast_tuner_${KERNEL} PUBLIC
-                           $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
-                           $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
-                           ${API_INCLUDE_DIRS})
+    add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
     target_link_libraries(clblast_tuner_${KERNEL} ${API_LIBRARIES})
+    target_include_directories(clblast_tuner_${KERNEL} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
   foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
-    add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} ${TUNERS_HEADERS}
-                   src/tuning/routines/${ROUTINE_TUNER}.cpp)
-    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC
-                           $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
-                           $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
-                           ${API_INCLUDE_DIRS})
-    target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} ${API_LIBRARIES})
+    add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
+    target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
+    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
     install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
   endforeach()
 
-- 
cgit v1.2.3


From 76d2b7f0b6fecb81ddc6912f5aae3e1ee9b89b29 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 19 Nov 2017 12:59:52 +0100
Subject: Revived the GEMM routine tuner; minor formatting changes

---
 CHANGELOG                     |  1 +
 src/tuning/routines/xgemm.cpp | 39 ++++++++++++++++++++++++++-------------
 src/tuning/tuning.cpp         |  2 +-
 src/tuning/tuning.hpp         |  2 +-
 src/utilities/timing.cpp      |  4 ++--
 src/utilities/timing.hpp      | 10 +++++++---
 6 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 200fdc53..95508951 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,6 @@
 
 Development (next version)
+- Re-designed and integrated the auto-tuner, no more dependency on CLTune
 - Added tuned parameters for various devices (see README)
 
 Version 1.2.0
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index a880c97e..cd22137a 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -18,7 +18,7 @@
 #include <assert.h>
 
 #include "utilities/utilities.hpp"
-#include "utilities/timing.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -68,7 +68,7 @@ void TuneXgemm(int argc, char* argv[]) {
   const auto platform = Platform(platform_id);
   const auto device = Device(platform, device_id);
   if (!PrecisionSupported<T>(device)) {
-    printf("* Unsupported precision, skipping this tuning run\n\n");
+    printf("* Unsupported precision, skipping this tuning run\n");
     return;
   }
   const auto context = Context(device);
@@ -81,18 +81,18 @@ void TuneXgemm(int argc, char* argv[]) {
   auto buffers = std::vector<Buffer<T>>{a_mat, b_mat, c_mat};
 
   // In-direct version
-  printf("[----------] Testing the in-direct GEMM routine for m=n=k\n");
+  printf("\n* Testing the in-direct GEMM routine for m=n=k\n");
   ForceSelectIndirectFrom<T>(0, device);
   const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
 
   // Direct version
-  printf("[----------] Testing the direct GEMM routine for m=n=k\n");
+  printf("\n* Testing the direct GEMM routine for m=n=k\n");
   ForceSelectIndirectFrom<T>(to * to * to + 1, device);
   const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
 
   // Determining final score and best kernel selection point
   assert(indirect.size() == direct.size());
-  printf("[----------] Collecting results\n");
+  printf("\n* Collecting results\n");
   auto ratios = std::vector<double>(indirect.size());
   for (auto i = size_t{0}; i < indirect.size(); ++i) {
     ratios[i] = indirect[i].second / direct[i].second;
@@ -104,42 +104,55 @@ void TuneXgemm(int argc, char* argv[]) {
     for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
     const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
     const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1);
+    auto tuning_results = Configuration();
+    tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = indirect[i].first;
+    tuning_results["PRECISION"] = static_cast<size_t>(precision);
     scores[i] = TuningResult{
         "gemm_kernel_selection",
         (relative_score * relative_score) * 100 + epsilon,  // squared for proper default computation
-        TuningParameters{
-            TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first},
-            TuningParameter{"PRECISION", static_cast<size_t>(precision)}
-        }
+        tuning_results
     };
   }
 
   // Displaying results
-  printf("[ -------> ]   value indirect   direct    score (lowest means best switching point)\n");
+  printf("|   value |    indirect |      direct |  score   | (indirect/direct in GFLOPS; lowest score == best switching point)\n");
+  printf("x---------x-------------x-------------x----------x\n");
   for (auto i = size_t{0}; i < indirect.size(); ++i) {
     assert(indirect[i].first == direct[i].first);
     const auto value = indirect[i].first;
     if (indirect[i].second != -1 && direct[i].second != -1) {
       const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6);
       const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6);
-      printf("[ -------> ] %7zu %8.2lf %8.2lf %8.2lf\n",
+      printf("| %7zu | %11.2lf | %11.2lf | %8.3lf |\n",
              value, gflops_indirect, gflops_direct, scores[i].score);
     }
   }
+  printf("x---------x-------------x-------------x----------x\n");
+  printf("\n");
+
+  // Computes the best switching point
+  auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
+  const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison);
+  const auto best_switching_point = best_configuration->config["XGEMM_MIN_INDIRECT_SIZE"];
+  const auto best_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(best_switching_point);
 
   // Outputs the results as JSON to disk, including some meta-data
   const auto precision_string = std::to_string(static_cast<size_t>(precision));
   auto metadata = std::vector<std::pair<std::string,std::string>>{
       {"kernel_family", "gemm_routine"},
+      {"precision", precision_string},
       {"arg_from", ToString(from)},
       {"arg_to", ToString(to)},
       {"arg_step", ToString(step)},
-      {"precision", precision_string},
+      {"best_kernel", best_configuration->name},
+      {"best_time", ToString(best_configuration->score)},
+      {"best_parameters", best_string}
   };
   PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json",
                            device, platform, metadata, scores);
 
-  printf("[  STATUS  ] All done\n");
+  printf("* Completed tuning process\n");
+  printf("\n");
 }
 
 // =================================================================================================
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index bd8337b4..5db7d2fb 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -77,7 +77,7 @@ void PrintTimingsToFileAsJSON(const std::string &filename,
 void print_separator(const size_t parameters_size) {
   printf("x------x-------x");
   for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); }
-  printf("-x----------x------------x--------x-------------------x\n");
+  printf("-x----------x--------------x--------x-------------------x\n");
 }
 
 // =================================================================================================
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 41f394c1..95464001 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -211,7 +211,7 @@ void Tuner(int argc, char* argv[]) {
   printf("\n");
   printf("|   ID | total |");
   for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf("     "); }
-  printf("param | compiles |       time | %6s |            status |\n", settings.performance_unit.c_str());
+  printf("param | compiles |         time | %6s |            status |\n", settings.performance_unit.c_str());
   print_separator(settings.parameters.size());
 
   // First runs a reference example to compare against
diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp
index 188e4487..af6a8ff2 100644
--- a/src/utilities/timing.cpp
+++ b/src/utilities/timing.cpp
@@ -65,12 +65,12 @@ double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Dev
                   std::vector<size_t> global, const std::vector<size_t> &local) {
   try {
     const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local);
-    printf(" %7.2lf ms |", time_ms);
+    printf(" %9.2lf ms |", time_ms);
     return time_ms;
   }
   catch (...) {
     const auto status_code = DispatchExceptionCatchAll(true);
-    printf("  error %3d |", static_cast<int>(status_code));
+    printf("  error %-5d |", static_cast<int>(status_code));
     return -1.0; // invalid
   }
 }
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index e8040058..a66aba4b 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -55,19 +55,23 @@ std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t
                                 const size_t num_runs, const Queue& queue,
                                 const std::vector<Buffer<T>>& buffers, F const &routine) {
   auto timings = std::vector<Timing>();
+  printf("|  value |         time |\n");
+  printf("x--------x--------------x\n");
   for (auto value = from; value < to; value += step) {
-    printf("[ RUN      ] Running with value %zu\n", value);
+    printf("| %6zu |", value);
     try {
       const auto FunctionToTune = [&]() { routine(value, queue, buffers); };
       const auto time_ms = TimeFunction(num_runs, FunctionToTune);
-      printf("[       OK ] Took %.2lf ms\n", time_ms);
+      printf(" %9.2lf ms |\n", time_ms);
       timings.push_back({value, time_ms});
     }
     catch (...) {
-      printf("[    ERROR ] Exception caught\n");
+      const auto status_code = DispatchExceptionCatchAll(true);
+      printf("  error %-5d |\n", static_cast<int>(status_code));
       timings.push_back({value, -1.0}); // invalid
     }
   }
+  printf("x--------x--------------x\n");
   return timings;
 }
 
-- 
cgit v1.2.3


From c6690df8962dc48112558c09531eeda9d93d1e97 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 19 Nov 2017 14:33:25 +0100
Subject: Made the tuners be compiled by default

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7f30906..139e230e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ set(clblast_VERSION_PATCH 0)
 # Options and their default values
 option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON)
 option(SAMPLES "Enable compilation of the examples" OFF)
-option(TUNERS "Enable compilation of the tuners" OFF)
+option(TUNERS "Enable compilation of the tuners" ON)
 option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
 option(TESTS "Enable compilation of the correctness tests" OFF)
 option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
-- 
cgit v1.2.3


From a3a8b44f598b3eca18ab226112bf5c2bc3b19df8 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 19 Nov 2017 16:31:08 +0100
Subject: Some fixes for the new auto-tuner to be compatible with the Python
 scripts

---
 scripts/database/database/clblast.py |  1 -
 scripts/database/database/io.py      |  8 ++++++++
 src/clpp11.hpp                       |  7 +++++++
 src/cupp11.hpp                       |  3 +++
 src/tuning/tuning.cpp                | 10 +++++++---
 src/tuning/tuning.hpp                |  3 ++-
 6 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py
index 2b4f734c..1a541fff 100644
--- a/scripts/database/database/clblast.py
+++ b/scripts/database/database/clblast.py
@@ -173,7 +173,6 @@ def print_cpp_database(database, output_dir):
                                 kernels = sorted(set([s["kernel"] for s in device_database]))
                                 for kernel in kernels:
                                     kernel_database = [s for s in device_database if s["kernel"] == kernel]
-
                                     assert len(kernel_database) == 1
                                     results = kernel_database[0]["results"]
 
diff --git a/scripts/database/database/io.py b/scripts/database/database/io.py
index 15a39cc1..29d47591 100644
--- a/scripts/database/database/io.py
+++ b/scripts/database/database/io.py
@@ -83,6 +83,14 @@ def load_tuning_results(filename):
     # Removes the numbering following the kernel family name
     json_data["kernel_family"] = re.sub(r'_\d+', '', json_data["kernel_family"])
 
+    # Removes unnecessary data (keys may be absent, e.g. in files written before the tuner emitted them)
+    if "best_kernel" in json_data:
+        del json_data["best_kernel"]
+    if "best_time" in json_data:
+        del json_data["best_time"]
+    if "best_parameters" in json_data:
+        del json_data["best_parameters"]
+
     # Adds the kernel name to the section instead of to the individual results
     assert len(json_data["results"]) > 0
     json_data["kernel"] = json_data["results"][0]["kernel"]
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 82fc44fd..0db64ad9 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -352,6 +352,13 @@ class Device {
            std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV));
   }
 
+  // Retrieves the above extra information (if present)
+  std::string GetExtraInfo() const {
+    if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); }
+    if (HasExtension("cl_nv_device_attribute_query")) { return NVIDIAComputeCapability(); }
+    else { return std::string{""}; }
+  }
+
   // Accessor to the private data-member
   const RawDeviceID& operator()() const { return device_; }
  private:
diff --git a/src/cupp11.hpp b/src/cupp11.hpp
index ec21c5b1..00337ebd 100644
--- a/src/cupp11.hpp
+++ b/src/cupp11.hpp
@@ -326,6 +326,9 @@ public:
   std::string AMDBoardName() const { return ""; }
   std::string NVIDIAComputeCapability() const { return Capabilities(); }
 
+  // Retrieves the above extra information
+  std::string GetExtraInfo() const { return NVIDIAComputeCapability(); }
+
   // Accessor to the private data-member
   const RawDeviceID& operator()() const { return device_; }
 private:
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index 5db7d2fb..6804fbce 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -37,13 +37,17 @@ void PrintTimingsToFileAsJSON(const std::string &filename,
   for (auto &datum: metadata) {
     fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
   }
-  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
-  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
-  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
   fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
   fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
+  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
+  fprintf(file, "  \"device\": \"%s\",\n", device.Name().c_str());
+  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
+  fprintf(file, "  \"device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, "  \"device_type\": \"%s\",\n", device.Type().c_str());
   fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
   fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
+  fprintf(file, "  \"device_extra_info\": \"%s\",\n", device.GetExtraInfo().c_str());
   fprintf(file, "  \"results\": [\n");
 
   // Loops over all results
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 95464001..80d71e06 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -260,7 +260,7 @@ void Tuner(int argc, char* argv[]) {
   for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) {
     try {
 
-      const auto configuration = configurations[config_id];
+      auto configuration = configurations[config_id];
       printf("| %4zu | %5zu |", config_id + 1, configurations.size());
       for (const auto& parameter : settings.parameters) {
         printf("%5zu", configuration.at(parameter.first));
@@ -321,6 +321,7 @@ void Tuner(int argc, char* argv[]) {
       }
 
       // All was OK
+      configuration["PRECISION"] = static_cast<size_t>(args.precision);
       results.push_back(TuningResult{settings.kernel_name, time_ms, configuration});
       printf(" %6.1lf |", settings.metric_amount / (time_ms * 1.0e6));
       printf("     %sresults match%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str());
-- 
cgit v1.2.3


From 4e0d08c3bcf1816984934fcb211355590564615f Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 19 Nov 2017 16:58:13 +0100
Subject: Added compilation timing and better compilation error reporting

---
 src/tuning/tuning.cpp     |  2 +-
 src/tuning/tuning.hpp     | 20 ++++++++++++++------
 src/utilities/compile.cpp |  6 +++---
 src/utilities/compile.hpp |  2 +-
 4 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index 6804fbce..0af17a6f 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -81,7 +81,7 @@ void PrintTimingsToFileAsJSON(const std::string &filename,
 void print_separator(const size_t parameters_size) {
   printf("x------x-------x");
   for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); }
-  printf("-x----------x--------------x--------x-------------------x\n");
+  printf("-x----------------x--------------x--------x-------------------x\n");
 }
 
 // =================================================================================================
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 80d71e06..2c7f6a0b 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -21,6 +21,7 @@
 #include <utility>
 #include <algorithm>
 #include <iostream>
+#include <chrono>
 
 #include "utilities/utilities.hpp"
 #include "utilities/compile.hpp"
@@ -211,7 +212,7 @@ void Tuner(int argc, char* argv[]) {
   printf("\n");
   printf("|   ID | total |");
   for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf("     "); }
-  printf("param | compiles |         time | %6s |            status |\n", settings.performance_unit.c_str());
+  printf("param |       compiles |         time | %6s |            status |\n", settings.performance_unit.c_str());
   print_separator(settings.parameters.size());
 
   // First runs a reference example to compare against
@@ -232,7 +233,7 @@ void Tuner(int argc, char* argv[]) {
                                            device, context, compiler_options);
     auto kernel = Kernel(program, settings.kernel_name);
     C::SetArguments(kernel, args, device_buffers);
-    printf("       %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str());
+    printf("             %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str());
 
     // Runs the kernel
     const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device,
@@ -248,7 +249,6 @@ void Tuner(int argc, char* argv[]) {
   }
   catch (...) {
     const auto status_code = DispatchExceptionCatchAll(true);
-    printf(" %d |\n", static_cast<int>(status_code));
     printf("* Exception caught with status %d while running the reference, aborting\n",
            static_cast<int>(status_code));
     return;
@@ -286,11 +286,14 @@ void Tuner(int argc, char* argv[]) {
       kernel_source += settings.sources;
 
       // Compiles the kernel
+      const auto start_time = std::chrono::steady_clock::now();
       auto compiler_options = std::vector<std::string>();
       const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name,
-                                             device, context, compiler_options);
+                                             device, context, compiler_options, true);
       auto kernel = Kernel(program, settings.kernel_name);
-      printf("       %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str());
+      const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+      const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+      printf("   %sOK%s  %5.0lf ms |", kPrintSuccess.c_str(), kPrintEnd.c_str(), timing);
 
       // Runs the kernel
       C::SetArguments(kernel, args, device_buffers);
@@ -326,6 +329,12 @@ void Tuner(int argc, char* argv[]) {
       printf(" %6.1lf |", settings.metric_amount / (time_ms * 1.0e6));
       printf("     %sresults match%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str());
     }
+    catch (const CLCudaAPIBuildError &e) {
+      const auto status_code = DispatchExceptionCatchAll(true);
+      printf("  %scompilation error: %5d%s     |",
+             kPrintError.c_str(), static_cast<int>(status_code), kPrintEnd.c_str());
+      printf("      - |                 - | <-- skipping\n");
+    }
     catch (...) {
       const auto status_code = DispatchExceptionCatchAll(true);
       if (status_code != StatusCode::kUnknownError) {
@@ -384,7 +393,6 @@ void Tuner(int argc, char* argv[]) {
 
   printf("* Completed tuning process\n");
   printf("\n");
- 
 }
 
 // =================================================================================================
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index 3c02d316..2a55506e 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -23,7 +23,7 @@ namespace clblast {
 Program CompileFromSource(const std::string &source_string, const Precision precision,
                           const std::string &routine_name,
                           const Device& device, const Context& context,
-                          std::vector<std::string>& options) {
+                          std::vector<std::string>& options, const bool silent) {
   auto header_string = std::string{""};
 
   header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
@@ -78,8 +78,8 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
   try {
     program.Build(device, options);
   } catch (const CLCudaAPIBuildError &e) {
-    if (program.StatusIsCompilationWarningOrError(e.status())) {
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
+    if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) {
+      fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
               program.GetBuildInfo(device).c_str());
     }
     throw;
diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp
index bd4686eb..0315d70c 100644
--- a/src/utilities/compile.hpp
+++ b/src/utilities/compile.hpp
@@ -27,7 +27,7 @@ namespace clblast {
 Program CompileFromSource(const std::string &source_string, const Precision precision,
                           const std::string &routine_name,
                           const Device& device, const Context& context,
-                          std::vector<std::string>& options);
+                          std::vector<std::string>& options, const bool silent = false);
 
 // =================================================================================================
 } // namespace clblast
-- 
cgit v1.2.3


From defad3d1a249dd5f8c011cf28cc3c888d710d56a Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 19 Nov 2017 18:19:21 +0100
Subject: Minor fix to the database script

---
 scripts/database/database.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/database/database.py b/scripts/database/database.py
index 8f3ccce6..28f6ebf8 100755
--- a/scripts/database/database.py
+++ b/scripts/database/database.py
@@ -127,7 +127,7 @@ def main(argv):
     # Removes database entries before continuing
     if cl_args.remove_device is not None:
         print("[database] Removing all results for device '%s'" % cl_args.remove_device)
-        remove_database_entries(database, {"clblast_device": cl_args.remove_device})
+        remove_database_entries(database, {"clblast_device_name": cl_args.remove_device})
         io.save_database(database, database_filename)
 
     # Retrieves the best performing results
-- 
cgit v1.2.3