Merge pull request #216 from CNugteren/integrated_tuner

Integrated tuner
author: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-19 20:05:15 +0100
committer: GitHub <noreply@github.com> 2017-11-19 20:05:15 +0100
commit: da76d7ab81555452a1049eb1a6d130073427067d (patch)
tree: 92439d8bee44c34d63f288a73bdc372ba84dc42b /src
parent: c41d219ea42087c1b8d933b733b381005123cb91 (diff)
parent: defad3d1a249dd5f8c011cf28cc3c888d710d56a (diff)
27 files changed, 1079 insertions, 549 deletions
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 82fc44fd..0db64ad9 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -352,6 +352,13 @@ class Device {
            std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV));
   }
 
+  // Returns the AMD board name or the NVIDIA compute capability if available, else an empty string
+  std::string GetExtraInfo() const {
+    return HasExtension("cl_amd_device_attribute_query") ? AMDBoardName() :
+           HasExtension("cl_nv_device_attribute_query") ? NVIDIAComputeCapability() :
+           std::string{""};
+  }
+
   // Accessor to the private data-member
   const RawDeviceID& operator()() const { return device_; }
  private:
diff --git a/src/cupp11.hpp b/src/cupp11.hpp
index ec21c5b1..00337ebd 100644
--- a/src/cupp11.hpp
+++ b/src/cupp11.hpp
@@ -326,6 +326,9 @@ public:
   std::string AMDBoardName() const { return ""; }
   std::string NVIDIAComputeCapability() const { return Capabilities(); }
 
+  // Retrieves the extra information: unconditionally the NVIDIA compute capability (no check here)
+  std::string GetExtraInfo() const { return NVIDIAComputeCapability(); }
+
   // Accessor to the private data-member
   const RawDeviceID& operator()() const { return device_; }
 private:
diff --git a/src/routine.cpp b/src/routine.cpp
index 81201eea..93882fbf 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -135,74 +135,21 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
     throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
   }
 
-  // Collects the parameters for this device in the form of defines, and adds the precision
+  // Collects the parameters for this device in the form of defines
   auto source_string = std::string{""};
   for (const auto &kernel_name : kernel_names_) {
     source_string += db_(kernel_name).GetDefines();
   }
-  source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
-
-  // Adds the name of the routine as a define
-  source_string += "#define ROUTINE_"+routine_name_+"\n";
-
-  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
-  // which it is known to work with all OpenCL platforms.
-  if (device_.IsNVIDIA() || device_.IsARM()) {
-    source_string += "#define USE_INLINE_KEYWORD 1\n";
-  }
-
-  // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
-  // performance, but might result in a reduced accuracy.
-  if (device_.IsAMD() && device_.IsGPU()) {
-    source_string += "#define USE_CL_MAD 1\n";
-  }
-
-  // For specific devices, use staggered/shuffled workgroup indices.
-  if (device_.IsAMD() && device_.IsGPU()) {
-    source_string += "#define USE_STAGGERED_INDICES 1\n";
-  }
-
-  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
-  // performance through better cache behaviour
-  if (device_.IsARM() && device_.IsGPU()) {
-    source_string += "#define GLOBAL_MEM_FENCE 1\n";
-  }
-
-  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
-  #ifdef CUDA_API
-    source_string +=
-      #include "kernels/opencl_to_cuda.h"
-    ;
-  #endif
-
-  // Loads the common header (typedefs and defines and such)
-  source_string +=
-    #include "kernels/common.opencl"
-  ;
 
   // Adds routine-specific code to the constructed source string
   for (const char *s: source) {
     source_string += s;
   }
 
-  // Prints details of the routine to compile in case of debugging in verbose mode
-  #ifdef VERBOSE
-    printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n",
-           routine_name_.c_str(), ToString(precision_).c_str(), device_name.c_str());
-    const auto start_time = std::chrono::steady_clock::now();
-  #endif
+  // Completes the source and compiles the kernel
+  program_ = CompileFromSource(source_string, precision_, routine_name_,
+                               device_, context_, options);
 
-  // Compiles the kernel
-  program_ = Program(context_, source_string);
-  try {
-    program_.Build(device_, options);
-  } catch (const CLCudaAPIBuildError &e) {
-    if (program_.StatusIsCompilationWarningOrError(e.status())) {
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
-              program_.GetBuildInfo(device_).c_str());
-    }
-    throw;
-  }
 
   // Store the compiled binary and program in the cache
   BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
@@ -210,13 +157,6 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
 
   ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
                                  Program{ program_ });
-
-  // Prints the elapsed compilation time in case of debugging in verbose mode
-  #ifdef VERBOSE
-    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
-    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
-    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
-  #endif
 }
 
 // =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index bf3b1762..06d001d9 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -20,6 +20,7 @@
 #include <vector>
 
 #include "utilities/utilities.hpp"
+#include "utilities/compile.hpp"
 #include "database/database.hpp"
 
 namespace clblast {
diff --git a/src/tuning/configurations.cpp b/src/tuning/configurations.cpp
new file mode 100644
index 00000000..459d66b1
--- /dev/null
+++ b/src/tuning/configurations.cpp
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune).
+// This is only used for the optional tuner binaries and not part of the core of CLBlast.
+//
+// =================================================================================================
+
+#include <vector>
+#include <string>
+
+#include "tuning/configurations.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Collects every combination of parameter values that passes the user-defined constraints.
+std::vector<Configuration> SetConfigurations(const std::vector<Parameter> parameters,
+                                             const Constraints& constraints) {
+  std::vector<Configuration> valid_configurations{};
+  const Configuration empty_configuration{};  // the recursion starts without any value assigned
+  PopulateConfigurations(parameters, 0, empty_configuration, valid_configurations, constraints);
+  return valid_configurations;
+}
+
+// Recursively enumerates the cartesian product of all parameter values: 'config' carries the
+// values chosen so far, and each complete, valid assignment is appended to 'configuration'.
+void PopulateConfigurations(const std::vector<Parameter> &parameters,
+                            const size_t index, const Configuration &config,
+                            std::vector<Configuration> &configuration,
+                            const Constraints& constraints) {
+
+  // End of the chain: every parameter has a value, so keep the configuration only if it is valid
+  if (index == parameters.size()) {
+    if (ValidConfiguration(config, constraints)) { configuration.push_back(config); }
+    return;
+  }
+
+  // Binds by reference to avoid copying the name and value list on every recursive call
+  const auto &parameter = parameters[index];
+
+  // Tries every value on a private copy of the configuration and recurses to the next parameter
+  for (const auto &value: parameter.second) {
+    auto config_copy = config;
+    config_copy[parameter.first] = value;
+    PopulateConfigurations(parameters, index+1, config_copy, configuration, constraints);
+  }
+}
+
+// Checks a configuration against the user-defined constraints: it is valid only if the
+// 'valid_if' function of every constraint accepts the values of the parameters it names.
+bool ValidConfiguration(const Configuration &config,
+                        const Constraints& constraints) {
+  for (const auto &constraint: constraints) {
+
+    // Gathers the values in the order of the names ('at' throws for a name not in 'config')
+    std::vector<size_t> values;
+    values.reserve(constraint.parameters.size());
+    for (const auto &name: constraint.parameters) {
+      values.push_back(config.at(name));
+    }
+
+    // One rejected constraint is enough to discard the configuration
+    if (!constraint.valid_if(values)) {
+      return false;
+    }
+  }
+
+  // No constraint objected
+  return true;
+}
+
+// Derives a thread configuration (local/global) from 'base': every dimension is first multiplied
+// by the values of the parameters named in 'mul_config' and afterwards divided (integer
+// division) by the values of those named in 'div_config'.
+std::vector<size_t> SetThreadConfiguration(const Configuration& config,
+                                           const std::vector<size_t> base,
+                                           const TransformVector& mul_config,
+                                           const TransformVector& div_config) {
+  std::vector<size_t> threads(base.begin(), base.end());
+  // Entry 'dim' of every name list scales dimension 'dim' of the thread configuration
+  for (const auto &names: mul_config) {
+    for (size_t dim = 0; dim != names.size(); ++dim) { threads[dim] *= config.at(names[dim]); }
+  }
+  // All multiplications above are applied before any of the divisions below
+  for (const auto &names: div_config) {
+    for (size_t dim = 0; dim != names.size(); ++dim) { threads[dim] /= config.at(names[dim]); }
+  }
+  return threads;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/configurations.hpp b/src/tuning/configurations.hpp
new file mode 100644
index 00000000..74679ff6
--- /dev/null
+++ b/src/tuning/configurations.hpp
@@ -0,0 +1,73 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune).
+// This is only used for the optional tuner binaries and not part of the core of CLBlast.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TUNING_CONFIGURATIONS_H_
+#define CLBLAST_TUNING_CONFIGURATIONS_H_
+
+#include <vector>
+#include <string>
+#include <map>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+using Configuration = std::map<std::string, size_t>;
+using Parameter = std::pair<std::string, std::vector<size_t>>;
+using TransformVector = std::vector<std::vector<std::string>>;
+
+// A constraint on parameters: a predicate plus the names of the parameters it is evaluated on.
+// NOTE(review): std::function needs <functional>, presumably via utilities.hpp - confirm.
+using ConstraintFunction = std::function<bool(std::vector<size_t>)>;
+struct Constraint {
+  ConstraintFunction valid_if;          // returns false to reject the given parameter values
+  std::vector<std::string> parameters;  // names whose values are passed, in order, to 'valid_if'
+};
+using Constraints = std::vector<Constraint>;
+
+// =================================================================================================
+
+// Initializes an empty configuration (map from parameter name to value) and kicks-off the
+// recursive search for all configurations. It also applies the user-defined constraints within.
+std::vector<Configuration> SetConfigurations(const std::vector<Parameter> parameters,
+                                             const Constraints& constraints);
+
+// Iterates recursively over all permutations of the user-defined parameters. This code creates
+// multiple chains, in which each chain selects a unique combination of values for all parameters.
+// At the end of each chain (when all parameters are considered), the function stores the result
+// into the configuration list.
+void PopulateConfigurations(const std::vector<Parameter> &parameters,
+                            const size_t index, const Configuration &config,
+                            std::vector<Configuration> &configuration,
+                            const Constraints& constraints);
+
+// Loops over all user-defined constraints to check whether or not the configuration is valid.
+// Assumes initially all configurations are valid, then returns false if one of the constraints has
+// not been met. Constraints consist of a user-defined function and a list of parameter names, which
+// are replaced by parameter values in this function.
+bool ValidConfiguration(const Configuration &config,
+                        const Constraints& constraints);
+
+// Processes multipliers and dividers to obtain the final thread configuration
+std::vector<size_t> SetThreadConfiguration(const Configuration& config,
+                                           const std::vector<size_t> base,
+                                           const TransformVector& mul_config,
+                                           const TransformVector& div_config);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TUNING_CONFIGURATIONS_H_
+#endif
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index 068c5f1b..462107d3 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels.
+// This file uses the auto-tuner to tune the copy OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TuneCopy {
     settings.kernel_family = "copy";
     settings.kernel_name = "CopyMatrixFast";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/copy_fast.opencl"
     ;
@@ -51,6 +50,10 @@ class TuneCopy {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,20 +81,15 @@ class TuneCopy {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return std::vector<Constraint>{}; } // none
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m)); // assumed to mirror kernel arg order - verify
+    kernel.SetArgument(1, buffers[2]()); // buffer ID 2 == A matrix
+    kernel.SetArgument(2, buffers[3]()); // buffer ID 3 == B matrix (in settings.outputs)
+    kernel.SetArgument(3, GetRealArg(args.alpha));
   }
 };
 
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 7102d05d..24557517 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels.
+// This file uses the auto-tuner to tune the pad OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TunePad {
     settings.kernel_family = "pad";
     settings.kernel_name = "CopyPadMatrix";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/copy_pad.opencl"
     ;
@@ -51,6 +50,10 @@ class TunePad {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,28 +81,23 @@ class TunePad {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return std::vector<Constraint>{}; } // none
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(0);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m)); // assumed to mirror kernel arg order - verify
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.m));
+    kernel.SetArgument(3, 0); // presumably the source offset - confirm against the kernel
+    kernel.SetArgument(4, buffers[2]()); // buffer ID 2 == A matrix
+    kernel.SetArgument(5, static_cast<int>(args.m));
+    kernel.SetArgument(6, static_cast<int>(args.n));
+    kernel.SetArgument(7, static_cast<int>(args.m));
+    kernel.SetArgument(8, 0); // presumably the destination offset - confirm against the kernel
+    kernel.SetArgument(9, buffers[3]()); // buffer ID 3 == B matrix (in settings.outputs)
+    kernel.SetArgument(10, GetRealArg(args.alpha));
+    kernel.SetArgument(11, 0); // presumably a disabled flag - confirm against the kernel
   }
 };
 
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 56726903..1e0d3c7b 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels.
+// This file uses the auto-tuner to tune the transpose OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TuneTranspose {
     settings.kernel_family = "transpose";
     settings.kernel_name = "TransposeMatrixFast";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/transpose_fast.opencl"
     ;
@@ -51,6 +50,10 @@ class TuneTranspose {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,25 +81,15 @@ class TuneTranspose {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"});
-  }
+  static std::vector<Constraint> SetConstraints() { return std::vector<Constraint>{}; } // none
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m)); // assumed to mirror kernel arg order - verify
+    kernel.SetArgument(1, buffers[2]()); // buffer ID 2 == A matrix
+    kernel.SetArgument(2, buffers[3]()); // buffer ID 3 == B matrix (in settings.outputs)
+    kernel.SetArgument(3, GetRealArg(args.alpha));
   }
 };
 
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index dc46e903..087f8e67 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels.
+// This file uses the auto-tuner to tune the pad-transpose OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TunePadTranspose {
     settings.kernel_family = "padtranspose";
     settings.kernel_name = "TransposePadMatrix";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/transpose_pad.opencl"
     ;
@@ -51,6 +50,10 @@ class TunePadTranspose {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -77,33 +80,23 @@ class TunePadTranspose {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"});
-  }
+  static std::vector<Constraint> SetConstraints() { return std::vector<Constraint>{}; } // none
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(0);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m)); // assumed to mirror kernel arg order - verify
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.m));
+    kernel.SetArgument(3, 0); // presumably the source offset - confirm against the kernel
+    kernel.SetArgument(4, buffers[2]()); // buffer ID 2 == A matrix
+    kernel.SetArgument(5, static_cast<int>(args.n)); // note: n, m, n here versus m, n, m above
+    kernel.SetArgument(6, static_cast<int>(args.m));
+    kernel.SetArgument(7, static_cast<int>(args.n));
+    kernel.SetArgument(8, 0); // presumably the destination offset - confirm against the kernel
+    kernel.SetArgument(9, buffers[3]()); // buffer ID 3 == B matrix (in settings.outputs)
+    kernel.SetArgument(10, GetRealArg(args.alpha));
+    kernel.SetArgument(11, 0); // presumably a disabled flag - confirm against the kernel
   }
 };
 
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index e201949a..d843ea78 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xaxpy OpenCL kernels.
+// This file uses the auto-tuner to tune the xaxpy OpenCL kernels.
 //
 // =================================================================================================
 
@@ -41,7 +41,6 @@ class TuneXaxpy {
     settings.kernel_family = "xaxpy";
     settings.kernel_name = "XaxpyFastest";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level1/level1.opencl"
 #include "../src/kernels/level1/xaxpy.opencl"
     ;
@@ -50,6 +49,10 @@ class TuneXaxpy {
     settings.size_x = args.n;
     settings.size_y = args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1};
+    settings.outputs = {1};
+
     // Sets the base thread configuration
     settings.global_size = {args.n};
     settings.global_size_ref = settings.global_size;
@@ -80,20 +83,15 @@ class TuneXaxpy {
       throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW");
     }
   }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return std::vector<Constraint>{}; } // none
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentInput(x_vec);
-    tuner.AddArgumentOutput(y_vec);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.n)); // assumed to mirror kernel arg order - verify
+    kernel.SetArgument(1, GetRealArg(args.alpha));
+    kernel.SetArgument(2, buffers[0]()); // buffer ID 0 == X vector
+    kernel.SetArgument(3, buffers[1]()); // buffer ID 1 == Y vector (in settings.outputs)
   }
 };
 
diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp
index fb532680..12350657 100644
--- a/src/tuning/kernels/xdot.cpp
+++ b/src/tuning/kernels/xdot.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xdot OpenCL kernels. Note that the results are
+// This file uses the auto-tuner to tune the xdot OpenCL kernels. Note that the results are
 // not verified, since the result is not final and depends on the WGS2 parameter.
 //
 // =================================================================================================
@@ -42,7 +42,6 @@ class TuneXdot {
     settings.kernel_family = "xdot_"+std::to_string(V);
     settings.kernel_name = (V==1) ? "Xdot" : "XdotEpilogue";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level1/xdot.opencl"
     ;
 
@@ -51,6 +50,10 @@ class TuneXdot {
     settings.size_y = args.n;
     settings.size_temp = args.n; // Worst case
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1, 5};
+    settings.outputs = {}; // no output checking
+
     // Sets the base thread configuration
     settings.global_size = (V==1) ? std::vector<size_t>{2*64} : std::vector<size_t>{1};
     settings.global_size_ref = (V==1) ? std::vector<size_t>{2*64*64} : std::vector<size_t>{64};
@@ -58,8 +61,8 @@ class TuneXdot {
     settings.local_size_ref = {64};
 
     // Transforms the thread configuration based on the parameters
-    settings.mul_local = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}};
-    settings.mul_global = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}};
+    settings.mul_local = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}};
+    settings.mul_global = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}};
 
     // Sets the tuning parameters and their possible values
     settings.parameters = {
@@ -75,31 +78,26 @@ class TuneXdot {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return std::vector<Constraint>{}; } // none
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &temp) {
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
     if (V == 1) {
-      tuner.AddArgumentScalar(static_cast<int>(args.n));
-      tuner.AddArgumentInput(x_vec);
-      tuner.AddArgumentScalar(0);
-      tuner.AddArgumentScalar(1);
-      tuner.AddArgumentInput(y_vec);
-      tuner.AddArgumentScalar(0);
-      tuner.AddArgumentScalar(1);
-      tuner.AddArgumentInput(temp); // No output checking for the result - size varies
-      tuner.AddArgumentScalar(static_cast<int>(false));
+      kernel.SetArgument(0, static_cast<int>(args.n)); // V == 1: arguments for kernel 'Xdot'
+      kernel.SetArgument(1, buffers[0]()); // 0 == X vector
+      kernel.SetArgument(2, 0);
+      kernel.SetArgument(3, 1);
+      kernel.SetArgument(4, buffers[1]()); // 1 == Y vector
+      kernel.SetArgument(5, 0);
+      kernel.SetArgument(6, 1);
+      kernel.SetArgument(7, buffers[5]()); // 5 == temp; no output checking - size varies
+      kernel.SetArgument(8, static_cast<int>(false));
     }
     else {
-      tuner.AddArgumentInput(temp);
-      tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere
-      tuner.AddArgumentScalar(0);
+      kernel.SetArgument(0, buffers[5]()); // 5 == temp; V != 1: arguments for 'XdotEpilogue'
+      kernel.SetArgument(1, buffers[0]()); // 0 == X vector; no output checking - size varies
+      kernel.SetArgument(2, 0);
     }
   }
 };
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 6dcdf68b..16e32988 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations:
+// This file uses the auto-tuner to tune the xgemm OpenCL kernels. There are two variations:
 // - V==1: This tests some limited set of tuning parameters exhaustively.
 // - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
 //
@@ -38,7 +38,6 @@ class TuneXgemm {
     settings.default_k = 1024;
     settings.default_fraction = (V==1) ? 1.0 : 512.0; // test all or sample randomly
     settings.default_num_runs = 2;
-    settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch);
     return settings;
   }
 
@@ -50,7 +49,6 @@ class TuneXgemm {
     settings.kernel_family = (V==1) ? "xgemm_1" : "xgemm_2";
     settings.kernel_name = "Xgemm";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/xgemm_part1.opencl"
 #include "../src/kernels/level3/xgemm_part2.opencl"
 #include "../src/kernels/level3/xgemm_part3.opencl"
@@ -61,6 +59,10 @@ class TuneXgemm {
     settings.size_b = args.n * args.k;
     settings.size_c = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3, 4};
+    settings.outputs = {4};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -114,74 +116,51 @@ class TuneXgemm {
     settings.metric_amount = 2 * args.m * args.n * args.k;
     settings.performance_unit = "GFLOPS";
 
-    // Returns which search heuristic to use
-    if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); }
-    else {
-      // Use full-search to explore all parameter combinations or another strategy to search only a
-      // part of the parameter values. The fraction is set as a command-line argument.
-      if (args.fraction == 1.0 || args.fraction == 0.0) {
-        settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
-      } else {
-        settings.heuristic = args.heuristic_selection;
-      }
-    }
-
     return settings;
   }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints
-  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+  static std::vector<Constraint> SetConstraints() {
+    auto constraints = std::vector<Constraint>();
     auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
     auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
     auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
     // Requirement for unrolling the KWG loop
-    tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"});
+    constraints.push_back({MultipleOfX, {"KWG", "KWI"}});
     // Required for integer MWI and NWI
-    tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"});
+    constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}});
+    constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}});
     // Required for integer MWIA and NWIB
-    tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"});
+    constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}});
+    constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}});
     // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...)
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"});
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"});
+    constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}});
+    constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}});
 
     // Extra constraints for variation 1 to limit the set of options significantly
     if (V==1) {
       auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
-      tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"});
-      tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"});
-      tuner.AddConstraint(id, IsEqual, {"SA", "SB"});
+      constraints.push_back({IsEqual, {"MDIMC", "MDIMA"}});
+      constraints.push_back({IsEqual, {"NDIMC", "NDIMB"}});
+      constraints.push_back({IsEqual, {"SA", "SB"}});
     }
-  }
-
-  // Sets the local memory size
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG",
-                                                    "SB", "KWG", "NWG"});
+    return constraints;
   }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.k));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(GetRealArg(args.beta));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentInput(b_mat);
-    tuner.AddArgumentOutput(c_mat);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(0);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.k));
+    kernel.SetArgument(3, GetRealArg(args.alpha));
+    kernel.SetArgument(4, GetRealArg(args.beta));
+    kernel.SetArgument(5, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(6, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(7, buffers[4]()); // 4 == C matrix
+    kernel.SetArgument(8, 0);
+    kernel.SetArgument(9, 0);
   }
 };
 
diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp
index 619fb37a..60a983b4 100644
--- a/src/tuning/kernels/xgemm_direct.cpp
+++ b/src/tuning/kernels/xgemm_direct.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations:
+// This file uses the auto-tuner to tune the direct xgemm kernels. There are two variations:
 // - V==1: This tests some limited set of tuning parameters exhaustively.
 // - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
 //
@@ -36,9 +36,8 @@ class TuneXgemmDirect {
     settings.default_m = 256;
     settings.default_n = 256;
     settings.default_k = 256;
-    settings.default_fraction = (V==1) ? 1.0 : 32.0; // test all or sample randomly
+    settings.default_fraction = (V==1) ? 1.0 : 64.0; // test all or sample randomly
     settings.default_num_runs = 4;
-    settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch);
     return settings;
   }
 
@@ -50,7 +49,6 @@ class TuneXgemmDirect {
     settings.kernel_family = (V==1) ? "xgemm_direct_1" : "xgemm_direct_2";
     settings.kernel_name = "XgemmDirectTN";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/xgemm_direct_part1.opencl"
 #include "../src/kernels/level3/xgemm_direct_part2.opencl"
 #include "../src/kernels/level3/xgemm_direct_part3.opencl"
@@ -61,6 +59,10 @@ class TuneXgemmDirect {
     settings.size_b = args.n * args.k;
     settings.size_c = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3, 4};
+    settings.outputs = {4};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -89,7 +91,7 @@ class TuneXgemmDirect {
     }
     else { // a lot more tuning parameters - has to be sampled randomly, too much to test all
       settings.parameters = {
-        {"WGD", {8, 16, 32, 64, 128}},
+        {"WGD", {8, 16, 32, 64}},
         {"MDIMCD", {8, 16, 32}},
         {"NDIMCD", {8, 16, 32}},
         {"MDIMAD", {8, 16, 32}},
@@ -106,79 +108,57 @@ class TuneXgemmDirect {
     settings.metric_amount = 2 * args.m * args.n * args.k;
     settings.performance_unit = "GFLOPS";
 
-    // Returns which search heuristic to use
-    if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); }
-    else {
-      // Use full-search to explore all parameter combinations or another strategy to search only a
-      // part of the parameter values. The fraction is set as a command-line argument.
-      if (args.fraction == 1.0 || args.fraction == 0.0) {
-        settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
-      } else {
-        settings.heuristic = args.heuristic_selection;
-      }
-    }
-
     return settings;
   }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints
-  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+  static std::vector<Constraint> SetConstraints() {
+    auto constraints = std::vector<Constraint>();
     auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
     auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
     auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
     // Requirement for unrolling the WGD loop
-    tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"});
+    constraints.push_back({MultipleOfX, {"WGD", "KWID"}});
     // Required for integer MWID and NWID
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}});
     // Required for integer MWIAD and NWIBD
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}});
     // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...)
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"});
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"});
+    constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}});
+    constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}});
 
     // Extra constraints for variation 1 to limit the set of options significantly
     if (V==1) {
       auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
-      tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"});
-      tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"});
+      constraints.push_back({IsEqual, {"MDIMCD", "MDIMAD"}});
+      constraints.push_back({IsEqual, {"NDIMCD", "NDIMBD"}});
     }
-  }
-
-  // Sets the local memory size
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"});
+    return constraints;
   }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.k));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(GetRealArg(args.beta));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(0); // a_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld
-    tuner.AddArgumentInput(b_mat);
-    tuner.AddArgumentScalar(0); // b_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld
-    tuner.AddArgumentOutput(c_mat);
-    tuner.AddArgumentScalar(0); // c_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld
-    tuner.AddArgumentScalar(1); // c_do_transpose
-    tuner.AddArgumentScalar(0); // a_conjugate
-    tuner.AddArgumentScalar(0); // b_conjugate
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.k));
+    kernel.SetArgument(3, GetRealArg(args.alpha));
+    kernel.SetArgument(4, GetRealArg(args.beta));
+    kernel.SetArgument(5, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(6, 0); // a_offset
+    kernel.SetArgument(7, static_cast<int>(args.k)); // a_ld
+    kernel.SetArgument(8, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(9, 0); // b_offset
+    kernel.SetArgument(10, static_cast<int>(args.n)); // b_ld
+    kernel.SetArgument(11, buffers[4]()); // 4 == C matrix
+    kernel.SetArgument(12, 0); // c_offset
+    kernel.SetArgument(13, static_cast<int>(args.n)); // c_ld
+    kernel.SetArgument(14, 1); // c_do_transpose
+    kernel.SetArgument(15, 0); // a_conjugate
+    kernel.SetArgument(16, 0); // b_conjugate
   }
 };
 
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index e66b15f1..3eadd32b 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned:
+// This file uses the auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned:
 // 1: The full version of the kernel
 // 2: The fast version for non-transposed matrices
 // 3: The fast version for transposed matrices
@@ -45,7 +45,6 @@ class TuneXgemv {
     settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot");
     settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot");
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level2/xgemv.opencl"
 #include "../src/kernels/level2/xgemv_fast.opencl"
     ;
@@ -55,6 +54,10 @@ class TuneXgemv {
     settings.size_y = args.m;
     settings.size_a = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1, 2};
+    settings.outputs = {1};
+
     // Sets the base thread configuration
     settings.global_size = {args.m};
     settings.global_size_ref = settings.global_size;
@@ -63,9 +66,7 @@ class TuneXgemv {
 
     // Transforms the thread configuration based on the parameters
     settings.mul_local = {{"WGS"+std::to_string(V)}};
-    settings.div_global = (V==1 || V==2) ?
-                          TunerSettings::TransformVector{{"WPT"+std::to_string(V)}} :
-                          TunerSettings::TransformVector{};
+    settings.div_global = (V==1 || V==2) ? TransformVector{{"WPT"+std::to_string(V)}} : TransformVector{};
 
     // Sets the tuning parameters and their possible values
     if (V==1) {
@@ -98,53 +99,41 @@ class TuneXgemv {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+  static std::vector<Constraint> SetConstraints() {
+    auto constraints = std::vector<Constraint>();
     if (V==2 || V==3) {
       auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
-      tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
+      constraints.push_back({MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}});
     }
     if (V==3) {
       auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; };
-      tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
-    }
-  }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    if (V==1 || V==2) {
-      auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
-      tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
-    }
-    else {
-      auto LocalMemorySize = [args] (std::vector<size_t> v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); };
-      tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
+      constraints.push_back({LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}});
     }
+    return constraints;
   }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &) {
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
     auto a_rotated = (V==3) ? 1 : 0;
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(GetRealArg(args.beta));
-    tuner.AddArgumentScalar(static_cast<int>(a_rotated));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(x_vec);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(1);
-    tuner.AddArgumentOutput(y_vec);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(1);
-    tuner.AddArgumentScalar(0); // Conjugate transpose
-    tuner.AddArgumentScalar(0); // Additional parameter
-    tuner.AddArgumentScalar(0); // Banded 'kl'
-    tuner.AddArgumentScalar(0); // Banded 'ku'
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, GetRealArg(args.alpha));
+    kernel.SetArgument(3, GetRealArg(args.beta));
+    kernel.SetArgument(4, a_rotated);
+    kernel.SetArgument(5, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(6, 0);
+    kernel.SetArgument(7, static_cast<int>(args.m));
+    kernel.SetArgument(8, buffers[0]()); // 0 == X vector
+    kernel.SetArgument(9, 0);
+    kernel.SetArgument(10, 1);
+    kernel.SetArgument(11, buffers[1]()); // 1 == Y vector
+    kernel.SetArgument(12, 0);
+    kernel.SetArgument(13, 1);
+    kernel.SetArgument(14, 0); // Conjugate transpose
+    kernel.SetArgument(15, 0); // Additional parameter
+    kernel.SetArgument(16, 0); // Banded 'kl'
+    kernel.SetArgument(17, 0); // Banded 'ku'
   }
 };
 
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index c2eb1d31..745e553f 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels.
+// This file uses the auto-tuner to tune the xger OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TuneXger {
     settings.kernel_family = "xger";
     settings.kernel_name = "Xger";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level2/level2.opencl"
 #include "../src/kernels/level2/xger.opencl"
     ;
@@ -52,6 +51,10 @@ class TuneXger {
     settings.size_y = args.n;
     settings.size_a = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1, 2};
+    settings.outputs = {2};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,29 +81,24 @@ class TuneXger {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentInput(x_vec);
-    tuner.AddArgumentScalar(0); // x_offset
-    tuner.AddArgumentScalar(1); // x_increment
-    tuner.AddArgumentInput(y_vec);
-    tuner.AddArgumentScalar(0); // y_offset
-    tuner.AddArgumentScalar(1); // y_increment
-    tuner.AddArgumentOutput(a_mat);
-    tuner.AddArgumentScalar(0); // a_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.m)); // a_ld
-    tuner.AddArgumentScalar(0); // a_is_rowmajor
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, GetRealArg(args.alpha));
+    kernel.SetArgument(3, buffers[0]()); // 0 == X vector
+    kernel.SetArgument(4, 0); // x_offset
+    kernel.SetArgument(5, 1); // x_increment
+    kernel.SetArgument(6, buffers[1]()); // 1 == Y vector
+    kernel.SetArgument(7, 0); // y_offset
+    kernel.SetArgument(8, 1); // y_increment
+    kernel.SetArgument(9, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(10, 0); // a_offset
+    kernel.SetArgument(11, static_cast<int>(args.m)); // a_ld
+    kernel.SetArgument(12, 0); // a_is_rowmajor
   }
 };
 
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index a880c97e..cd22137a 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -18,7 +18,7 @@
 #include <assert.h>
 
 #include "utilities/utilities.hpp"
-#include "utilities/timing.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -68,7 +68,7 @@ void TuneXgemm(int argc, char* argv[]) {
   const auto platform = Platform(platform_id);
   const auto device = Device(platform, device_id);
   if (!PrecisionSupported<T>(device)) {
-    printf("* Unsupported precision, skipping this tuning run\n\n");
+    printf("* Unsupported precision, skipping this tuning run\n");
     return;
   }
   const auto context = Context(device);
@@ -81,18 +81,18 @@ void TuneXgemm(int argc, char* argv[]) {
   auto buffers = std::vector<Buffer<T>>{a_mat, b_mat, c_mat};
 
   // In-direct version
-  printf("[----------] Testing the in-direct GEMM routine for m=n=k\n");
+  printf("\n* Testing the in-direct GEMM routine for m=n=k\n");
   ForceSelectIndirectFrom<T>(0, device);
   const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
 
   // Direct version
-  printf("[----------] Testing the direct GEMM routine for m=n=k\n");
+  printf("\n* Testing the direct GEMM routine for m=n=k\n");
   ForceSelectIndirectFrom<T>(to * to * to + 1, device);
   const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
 
   // Determining final score and best kernel selection point
   assert(indirect.size() == direct.size());
-  printf("[----------] Collecting results\n");
+  printf("\n* Collecting results\n");
   auto ratios = std::vector<double>(indirect.size());
   for (auto i = size_t{0}; i < indirect.size(); ++i) {
     ratios[i] = indirect[i].second / direct[i].second;
@@ -104,42 +104,55 @@ void TuneXgemm(int argc, char* argv[]) {
     for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
     const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
     const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1);
+    auto tuning_results = Configuration();
+    tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = indirect[i].first;
+    tuning_results["PRECISION"] = static_cast<size_t>(precision);
     scores[i] = TuningResult{
         "gemm_kernel_selection",
         (relative_score * relative_score) * 100 + epsilon,  // squared for proper default computation
-        TuningParameters{
-            TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first},
-            TuningParameter{"PRECISION", static_cast<size_t>(precision)}
-        }
+        tuning_results
     };
   }
 
   // Displaying results
-  printf("[ -------> ]   value indirect   direct    score (lowest means best switching point)\n");
+  printf("|   value |    indirect |      direct |  score   | (GFLOPS; lowest score == best switching point)\n");
+  printf("x---------x-------------x-------------x----------x\n");
   for (auto i = size_t{0}; i < indirect.size(); ++i) {
     assert(indirect[i].first == direct[i].first);
     const auto value = indirect[i].first;
     if (indirect[i].second != -1 && direct[i].second != -1) {
       const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6);
       const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6);
-      printf("[ -------> ] %7zu %8.2lf %8.2lf %8.2lf\n",
+      printf("| %7zu | %11.2lf | %11.2lf | %8.3lf |\n", // speeds are in GFLOPS, not in ms
              value, gflops_indirect, gflops_direct, scores[i].score);
     }
   }
+  printf("x---------x-------------x-------------x----------x\n");
+  printf("\n");
+
+  // Computes the best switching point
+  auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
+  const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison);
+  const auto best_switching_point = best_configuration->config["XGEMM_MIN_INDIRECT_SIZE"];
+  const auto best_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(best_switching_point);
 
   // Outputs the results as JSON to disk, including some meta-data
   const auto precision_string = std::to_string(static_cast<size_t>(precision));
   auto metadata = std::vector<std::pair<std::string,std::string>>{
       {"kernel_family", "gemm_routine"},
+      {"precision", precision_string},
       {"arg_from", ToString(from)},
       {"arg_to", ToString(to)},
       {"arg_step", ToString(step)},
-      {"precision", precision_string},
+      {"best_kernel", best_configuration->name},
+      {"best_time", ToString(best_configuration->score)},
+      {"best_parameters", best_string}
   };
   PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json",
                            device, platform, metadata, scores);
 
-  printf("[  STATUS  ] All done\n");
+  printf("* Completed tuning process\n");
+  printf("\n");
 }
 
 // =================================================================================================
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
new file mode 100644
index 00000000..0af17a6f
--- /dev/null
+++ b/src/tuning/tuning.cpp
@@ -0,0 +1,88 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the generic CLBlast auto-tuner (inspired by CLTune). This is only used for
+// the optional and stand-alone tuner binaries and not part of the core of CLBlast.
+//
+// =================================================================================================
+
+#include <vector>
+#include <string>
+#include <random>
+#include <utility>
+#include <algorithm>
+#include <iostream>
+
+#include "utilities/utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Writes the tuning results plus the supplied meta-data to disk as JSON: first the meta-data
+// pairs, then device/platform info, then one entry per result. Returns early on a failed fopen.
+void PrintTimingsToFileAsJSON(const std::string &filename,
+                              const Device& device, const Platform& platform,
+                              const std::vector<std::pair<std::string,std::string>> &metadata,
+                              const std::vector<TuningResult>& tuning_results) {
+  const auto num_results = tuning_results.size();
+  printf("* Writing a total of %zu results to '%s'\n", num_results, filename.c_str());
+
+  // A failed fopen yields a null pointer, which must never reach fprintf or fclose
+  const auto file = fopen(filename.c_str(), "w");
+  if (file == nullptr) {
+    printf("* Error: could not open '%s' for writing\n", filename.c_str());
+    return;
+  }
+  fprintf(file, "{\n");
+  for (const auto &datum: metadata) {
+    fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
+  }
+  fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
+  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
+  fprintf(file, "  \"device\": \"%s\",\n", device.Name().c_str());
+  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
+  fprintf(file, "  \"device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, "  \"device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
+  fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
+  fprintf(file, "  \"device_extra_info\": \"%s\",\n", device.GetExtraInfo().c_str());
+  fprintf(file, "  \"results\": [\n");
+
+  // Loops over all results, taking each by reference to avoid copying its configuration
+  for (auto r = size_t{0}; r < num_results; ++r) {
+    const auto &result = tuning_results[r];
+    fprintf(file, "    {\n");
+    fprintf(file, "      \"kernel\": \"%s\",\n", result.name.c_str());
+    fprintf(file, "      \"time\": %.3lf,\n", result.score);
+    // Loops over all the parameters for this result: a comma precedes all but the first one
+    fprintf(file, "      \"parameters\": {");
+    auto separator = "";
+    for (const auto &parameter : result.config) {
+      fprintf(file, "%s\"%s\": %zu", separator, parameter.first.c_str(), parameter.second);
+      separator = ",";
+    }
+    fprintf(file, "}\n");
+    // The footer: a comma follows all but the last result
+    fprintf(file, "    }%s\n", (r + 1 < num_results) ? "," : "");
+  }
+  fprintf(file, "  ]\n}\n");
+  fclose(file);
+}
+
+void print_separator(const size_t parameters_size) { // prints one separator line to stdout
+  const auto dashes = std::string(5 * parameters_size, '-'); // five dashes per counted parameter
+  printf("x------x-------x%s-x----------------x--------------x--------x-------------------x\n",
+         dashes.c_str());
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index bc9c0e03..2c7f6a0b 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -7,26 +7,45 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the interface to the CLTune auto-tuner. This is only used for the optional
-// and stand-alone tuner binaries and not part of the core of CLBlast.
+// This file implements the generic CLBlast auto-tuner (inspired by CLTune). This is only used for
+//  the optional and stand-alone tuner binaries and not part of the core of CLBlast.
 //
 // =================================================================================================
 
-#ifndef CLBLAST_TUNING_H_
-#define CLBLAST_TUNING_H_
+#ifndef CLBLAST_TUNING_TUNING_H_
+#define CLBLAST_TUNING_TUNING_H_
 
 #include <vector>
 #include <string>
 #include <random>
 #include <utility>
-
-#include <cltune.h>
+#include <algorithm>
+#include <iostream>
+#include <chrono>
 
 #include "utilities/utilities.hpp"
+#include "utilities/compile.hpp"
+#include "utilities/timing.hpp"
+#include "tuning/configurations.hpp"
 
 namespace clblast {
 // =================================================================================================
 
+// Constants holding start and end strings for terminal-output in colour (ANSI escape sequences)
+#if defined(_WIN32)  // no escape sequences are emitted on Windows: all markers are empty strings
+  const std::string kPrintError = "";
+  const std::string kPrintSuccess = "";
+  const std::string kPrintMessage = "";
+  const std::string kPrintEnd = "";
+#else
+  const std::string kPrintError = "\x1b[31m";  // red foreground
+  const std::string kPrintSuccess = "\x1b[32m";  // green foreground
+  const std::string kPrintMessage = "\x1b[1m";  // bold
+  const std::string kPrintEnd = "\x1b[0m";  // resets all attributes
+#endif
+
+// =================================================================================================
+
 // Structures for the tuners with all the default settings
 struct TunerDefaults {
 
@@ -41,15 +60,7 @@ struct TunerDefaults {
   // Other defaults
   size_t default_batch_count = 1;
   size_t default_num_runs = 10; // run every kernel this many times for averaging
-
-  // Search heuristic defaults
   double default_fraction = 1.0;
-  size_t default_swarm_size_PSO = 8;
-  double default_influence_global_PSO = 0.1;
-  double default_influence_local_PSO = 0.3;
-  double default_influence_random_PSO = 0.6;
-  size_t default_heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
-  double default_max_temp_ann = 1.0;
 };
 
 // Structures for the tuners with the remaining settings
@@ -68,6 +79,10 @@ struct TunerSettings {
   size_t size_c = 1;
   size_t size_temp = 1;
 
+  // Inputs and outputs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+  std::vector<size_t> inputs = {};
+  std::vector<size_t> outputs = {};
+
   // Sets the base thread configuration
   std::vector<size_t> global_size = {};
   std::vector<size_t> global_size_ref = {};
@@ -75,25 +90,32 @@ struct TunerSettings {
   std::vector<size_t> local_size_ref = {};
 
   // Transforms the thread configuration based on the parameters
-  using TransformVector = std::vector<std::vector<std::string>>;
   TransformVector mul_local = {};
   TransformVector div_local = {};
   TransformVector mul_global = {};
   TransformVector div_global = {};
 
   // Sets the tuning parameters and their possible values
-  std::vector<std::pair<std::string, std::vector<size_t>>> parameters;
+  std::vector<Parameter> parameters;
 
   // Describes how to compute the performance metrics
   size_t metric_amount = 0;
   std::string performance_unit = "N/A";
-
-  // Returns which search heuristic to use
-  size_t heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
 };
 
 // =================================================================================================
 
+struct TuningResult { std::string name; double score; Configuration config; }; // score: time [ms]
+
+void PrintTimingsToFileAsJSON(const std::string &filename,
+                              const Device& device, const Platform& platform,
+                              const std::vector<std::pair<std::string,std::string>> &metadata,
+                              const std::vector<TuningResult>& tuning_results);
+
+void print_separator(const size_t parameters_size);
+
+// =================================================================================================
+
 // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
 // the results. Used for all types of kernel families. Note that this is a header-only function so
 // that it is automatically compiled for the various kernels (given as the 'C' template argument).
@@ -115,147 +137,266 @@ void Tuner(int argc, char* argv[]) {
     if (o == kArgK)        { args.k        = GetArgument(command_line_args, help, kArgK, defaults.default_k); }
     if (o == kArgAlpha)    { args.alpha    = GetArgument(command_line_args, help, kArgAlpha, GetScalar<T>()); }
     if (o == kArgBeta)     { args.beta     = GetArgument(command_line_args, help, kArgBeta, GetScalar<T>()); }
-    if (o == kArgFraction) { args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction); }
     if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, defaults.default_batch_count); }
-    if (o == kArgHeuristicSelection) {args.heuristic_selection = GetArgument(command_line_args, help, kArgHeuristicSelection, defaults.default_heuristic);  }
-    if (o == kArgPsoSwarmSize)   {args.pso_swarm_size      = GetArgument(command_line_args, help, kArgPsoSwarmSize , defaults.default_swarm_size_PSO);  }
-    if (o == kArgPsoInfGlobal)   {args.pso_inf_global      = GetArgument(command_line_args, help, kArgPsoInfGlobal, defaults.default_influence_global_PSO);  }
-    if (o == kArgPsoInfLocal)    {args.pso_inf_local       = GetArgument(command_line_args, help, kArgPsoInfLocal, defaults.default_influence_local_PSO);  }
-    if (o == kArgPsoInfRandom)   {args.pso_inf_random      = GetArgument(command_line_args, help, kArgPsoInfRandom, defaults.default_influence_random_PSO);  }
-    if (o == kArgAnnMaxTemp)     {args.ann_max_temperature = GetArgument(command_line_args, help, kArgAnnMaxTemp, defaults.default_max_temp_ann); }
   }
-  const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs);
-  fprintf(stdout, "%s\n", help.c_str());
+  args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction);
+  args.num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs);
+  const auto max_l2_norm = GetArgument(command_line_args, help, kArgMaxL2Norm, 1.0e-4);
+  printf("%s\n", help.c_str());
   const TunerSettings settings = C::GetTunerSettings(args);
 
   // Tests validity of the given arguments
   C::TestValidArguments(args);
 
+  // Initializes OpenCL
+  const auto platform = Platform(args.platform_id);
+  const auto device = Device(platform, args.device_id);
+  const auto context = Context(device);
+  auto queue = Queue(context, device);
+
   // Tests for validity of the precision and retrieves properties
-  auto isAMD = false;
-  auto isARM = false;
-  auto isGPU = false;
-  auto device_type = std::string{};
-  auto device_vendor = std::string{};
-  auto device_architecture = std::string{};
-  auto device_name = std::string{};
-  { // In a block such that the platform and the device are destroyed before initializing the tuner
-    const auto platform = Platform(args.platform_id);
-    const auto device = Device(platform, args.device_id);
-    if (!PrecisionSupported<T>(device)) {
-      printf("* Unsupported precision, skipping this tuning run\n\n");
-      return;
-    }
-    isAMD = device.IsAMD();
-    isARM = device.IsARM();
-    isGPU = device.IsGPU();
-    device_type = GetDeviceType(device);
-    device_vendor = GetDeviceVendor(device);
-    device_architecture = GetDeviceArchitecture(device);
-    device_name = GetDeviceName(device);
+  if (!PrecisionSupported<T>(device)) {
+    printf("* Unsupported precision, skipping this tuning run\n\n");
+    return;
   }
+  const auto device_type = GetDeviceType(device);
+  const auto device_vendor = GetDeviceVendor(device);
+  const auto device_architecture = GetDeviceArchitecture(device);
+  const auto device_name = GetDeviceName(device);
 
   // Creates input buffers with random data
-  auto x_vec = std::vector<T>(settings.size_x);
-  auto y_vec = std::vector<T>(settings.size_y);
-  auto a_mat = std::vector<T>(settings.size_a);
-  auto b_mat = std::vector<T>(settings.size_b);
-  auto c_mat = std::vector<T>(settings.size_c);
-  auto temp = std::vector<T>(settings.size_temp);
+  const auto buffer_sizes = std::vector<size_t>{
+      settings.size_x, settings.size_y,
+      settings.size_a, settings.size_b, settings.size_c,
+      settings.size_temp
+  };
   std::mt19937 mt(kSeed);
   std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
-  PopulateVector(x_vec, mt, dist);
-  PopulateVector(y_vec, mt, dist);
-  PopulateVector(a_mat, mt, dist);
-  PopulateVector(b_mat, mt, dist);
-  PopulateVector(c_mat, mt, dist);
-  PopulateVector(temp, mt, dist);
-
-  // Initializes the tuner for the chosen device
-  cltune::Tuner tuner(args.platform_id, args.device_id);
-
-  // Select the search method based on the command-line arguments
-  // If the tuner does not support the selected choice, full search will be returned.
-  auto method = settings.heuristic;
-  if      (method == 1) { tuner.UseRandomSearch(1.0/args.fraction); }
-  else if (method == 2) { tuner.UseAnnealing(1.0/args.fraction, args.ann_max_temperature); }
-  else if (method == 3) { tuner.UsePSO(1.0/args.fraction, args.pso_swarm_size, args.pso_inf_global,
-                                       args.pso_inf_local, args.pso_inf_random); }
-  else                  { tuner.UseFullSearch(); }
-
-  // Set extra settings for specific defines. This mimics src/routine.cc.
-  auto defines = std::string{""};
-  if (isAMD && isGPU) {
-    defines += "#define USE_CL_MAD 1\n";
-    defines += "#define USE_STAGGERED_INDICES 1\n";
+  auto source_buffers = std::vector<std::vector<T>>();
+  auto reference_buffers = std::vector<std::vector<T>>();
+  auto result_buffers = std::vector<std::vector<T>>();
+  auto device_buffers = std::vector<Buffer<T>>();
+  for (const auto size : buffer_sizes) {
+    auto host_buffer = std::vector<T>(size);
+    PopulateVector(host_buffer, mt, dist);
+    source_buffers.push_back(host_buffer);
+    auto reference_buffer = std::vector<T>(size);
+    reference_buffers.push_back(reference_buffer);
+    auto result_buffer = std::vector<T>(size);
+    result_buffers.push_back(result_buffer);
+    auto device_buffer = Buffer<T>(context, size);
+    device_buffers.push_back(device_buffer);
   }
-  if (isARM && isGPU) {
-    defines += "#define GLOBAL_MEM_FENCE 1\n";
-  }
-
-  // Loads the kernel sources and defines the kernel to tune
-  auto sources = defines + settings.sources;
-  auto id = tuner.AddKernelFromString(sources, settings.kernel_name, settings.global_size, settings.local_size);
-  tuner.SetReferenceFromString(sources, settings.kernel_name, settings.global_size_ref, settings.local_size_ref);
 
   // Sets the tunable parameters and their possible values
-  for (const auto &parameter: settings.parameters) {
-    tuner.AddParameter(id, parameter.first, parameter.second);
+  auto configurations = SetConfigurations(settings.parameters, C::SetConstraints());
+  printf("* Found %s%zu configuration(s)%s\n",
+         kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str());
+
+  // Select the search method (full search or a random fraction)
+  if (args.fraction != 0.0 && args.fraction != 1.0) {
+    const auto new_size = static_cast<size_t>(configurations.size() / args.fraction);
+    auto rng = std::default_random_engine{};
+    std::shuffle(std::begin(configurations), std::end(configurations), rng);
+    configurations.resize(new_size);
+    printf("* Exploring a random subset of %s%zu configuration(s)%s\n",
+           kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str());
   }
-  C::SetConstraints(tuner, id);
-  C::SetLocalMemorySize(tuner, id, args);
 
-  // Tests for a specific precision
-  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
-  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+  // Prints information about the parameters
+  printf("* Parameters explored: ");
+  for (const auto& parameter : settings.parameters) { printf("%s ", parameter.first.c_str()); }
+  printf("\n");
+
+  // Prints the header of the table
+  printf("\n");
+  printf("|   ID | total |");
+  for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf("     "); }
+  printf("param |       compiles |         time | %6s |            status |\n", settings.performance_unit.c_str());
+  print_separator(settings.parameters.size());
+
+  // First runs a reference example to compare against
+  try {
+    printf("|  ref |     - |");
+    for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf("     "); }
+    printf("    - |");
 
-  // Modifies the thread-sizes (both global and local) based on the parameters
-  for (auto &parameters: settings.mul_local) { tuner.MulLocalSize(id, parameters); }
-  for (auto &parameters: settings.div_local) { tuner.DivLocalSize(id, parameters); }
-  for (auto &parameters: settings.mul_global) { tuner.MulGlobalSize(id, parameters); }
-  for (auto &parameters: settings.div_global) { tuner.DivGlobalSize(id, parameters); }
 
-  // Sets the function's arguments
-  C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
+    // Sets the input
+    for (const auto id : settings.inputs) {
+      device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
+    }
+
+    // Compiles the kernel
+    auto compiler_options = std::vector<std::string>();
+    const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name,
+                                           device, context, compiler_options);
+    auto kernel = Kernel(program, settings.kernel_name);
+    C::SetArguments(kernel, args, device_buffers);
+    printf("             %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str());
+
+    // Runs the kernel
+    const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device,
+                                    settings.global_size_ref, settings.local_size_ref);
+    printf("      - |");
+    if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); }
+
+    // Saves the result
+    for (const auto id : settings.outputs) {
+      device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]);
+    }
+    printf("      %sreference OK%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str());
+  }
+  catch (...) {
+    const auto status_code = DispatchExceptionCatchAll(true);
+    printf("* Exception caught with status %d while running the reference, aborting\n",
+           static_cast<int>(status_code));
+    return;
+  }
+  print_separator(settings.parameters.size());
 
   // Starts the tuning process
-  tuner.SetNumRuns(num_runs);
-  tuner.Tune();
+  auto results = std::vector<TuningResult>();
+  for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) {
+    try {
+
+      auto configuration = configurations[config_id];
+      printf("| %4zu | %5zu |", config_id + 1, configurations.size());
+      for (const auto& parameter : settings.parameters) {
+        printf("%5zu", configuration.at(parameter.first));
+      }
+      printf(" |");
+
+      // Sets the input
+      for (const auto id : settings.inputs) {
+        device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
+      }
+
+      // Sets the thread configuration
+      const auto global = SetThreadConfiguration(configuration, settings.global_size,
+                                                 settings.mul_global, settings.div_global);
+      const auto local = SetThreadConfiguration(configuration, settings.local_size,
+                                                settings.mul_local, settings.div_local);
+
+      // Sets the parameters for this configuration
+      auto kernel_source = std::string{""};
+      for (const auto &parameter : configuration) {
+        kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n";
+      }
+      kernel_source += settings.sources;
+
+      // Compiles the kernel
+      const auto start_time = std::chrono::steady_clock::now();
+      auto compiler_options = std::vector<std::string>();
+      const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name,
+                                             device, context, compiler_options, true);
+      auto kernel = Kernel(program, settings.kernel_name);
+      const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+      const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+      printf("   %sOK%s  %5.0lf ms |", kPrintSuccess.c_str(), kPrintEnd.c_str(), timing);
+
+      // Runs the kernel
+      C::SetArguments(kernel, args, device_buffers);
+      const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local);
+
+      // Kernel run was not successful
+      if (time_ms == -1.0) {
+        printf("      - |");
+        printf("   %sinvalid config.%s |", kPrintError.c_str(), kPrintEnd.c_str());
+        printf(" <-- skipping\n");
+        continue;
+      }
+
+      // Compares the results
+      auto l2_error = 0.0;
+      for (const auto id : settings.outputs) {
+        device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]);
+        for (auto index = size_t{0}; index<buffer_sizes[id]; ++index) {
+          const auto diff = SquaredDifference(result_buffers[id][index], reference_buffers[id][index]);
+          l2_error += diff;
+        }
+        l2_error /= static_cast<double>(buffer_sizes[id]);
+        if (std::isnan(l2_error) || l2_error > max_l2_norm) {
+          printf("      - |");
+          printf(" %sL2 error %8.2e%s |", kPrintError.c_str(), l2_error, kPrintEnd.c_str());
+          throw std::runtime_error("L2 error too large");
+        }
+      }
+
+      // All was OK
+      configuration["PRECISION"] = static_cast<size_t>(args.precision);
+      results.push_back(TuningResult{settings.kernel_name, time_ms, configuration});
+      printf(" %6.1lf |", settings.metric_amount / (time_ms * 1.0e6));
+      printf("     %sresults match%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str());
+    }
+    catch (const CLCudaAPIBuildError &e) {
+      const auto status_code = DispatchExceptionCatchAll(true);
+      printf("  %scompilation error: %5d%s     |",
+             kPrintError.c_str(), static_cast<int>(status_code), kPrintEnd.c_str());
+      printf("      - |                 - | <-- skipping\n");
+    }
+    catch (...) {
+      const auto status_code = DispatchExceptionCatchAll(true);
+      if (status_code != StatusCode::kUnknownError) {
+        printf("   %serror code %d%s |",
+               kPrintError.c_str(), static_cast<int>(status_code), kPrintEnd.c_str());
+      }
+      printf(" <-- skipping\n");
+    }
+  }
+
+  // Completed the tuning process
+  print_separator(settings.parameters.size());
+  printf("\n");
+  if (results.size() == 0) { return; }
 
-  // Prints the results to screen
-  auto time_ms = tuner.PrintToScreen();
-  tuner.PrintFormatted();
+  // Computes the best results
+  auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
+  const auto best_configuration = std::min_element(results.begin(), results.end(), comparison);
+  const auto best_time_ms = best_configuration->score;
+  if (best_time_ms == 0.0) { return; }
 
   // Also prints the performance of the best-case in terms of GB/s or GFLOPS
-  if (time_ms != 0.0) {
-    printf("[ -------> ] %.2lf ms", time_ms);
-    printf(" or %.1lf %s\n", settings.metric_amount/(time_ms*1.0e6), settings.performance_unit.c_str());
+  printf("\n");
+  printf("* Found best result %.2lf ms", best_time_ms);
+  printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6),
+         settings.performance_unit.c_str());
+  printf("* Best parameters: ");
+  auto best_string = std::string{""};
+  auto i = size_t{0};
+  for (const auto config : best_configuration->config) {
+    best_string += "" + config.first + "=" + ToString(config.second);
+    if (i < best_configuration->config.size() - 1) { best_string += " "; }
+    ++i;
   }
+  printf("%s\n\n", best_string.c_str());
 
   // Outputs the results as JSON to disk, including some meta-data
   auto precision_string = std::to_string(static_cast<size_t>(args.precision));
   auto metadata = std::vector<std::pair<std::string,std::string>>{
     {"kernel_family", settings.kernel_family},
     {"precision", precision_string},
-    {"clblast_device_type", device_type},
-    {"clblast_device_vendor", device_vendor},
-    {"clblast_device_architecture", device_architecture},
-    {"clblast_device_name", device_name}
+    {"best_kernel", best_configuration->name},
+    {"best_time", ToString(best_configuration->score)},
+    {"best_parameters", best_string}
   };
   for (auto &o: defaults.options) {
-    if (o == kArgM)     { metadata.push_back({"arg_m", std::to_string(args.m)}); }
-    if (o == kArgN)     { metadata.push_back({"arg_n", std::to_string(args.n)}); }
-    if (o == kArgK)     { metadata.push_back({"arg_k", std::to_string(args.k)}); }
+    if (o == kArgM)     { metadata.push_back({"arg_m", ToString(args.m)}); }
+    if (o == kArgN)     { metadata.push_back({"arg_n", ToString(args.n)}); }
+    if (o == kArgK)     { metadata.push_back({"arg_k", ToString(args.k)}); }
     if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
     if (o == kArgBeta)  { metadata.push_back({"arg_beta", ToString(args.beta)}); }
     if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); }
   }
-  tuner.PrintJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json", metadata);
- 
+  PrintTimingsToFileAsJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json",
+                           device, platform, metadata, results);
+
+  printf("* Completed tuning process\n");
+  printf("\n");
 }
 
 // =================================================================================================
 } // namespace clblast
 
-// CLBLAST_TUNING_H_
+// CLBLAST_TUNING_TUNING_H_
 #endif
diff --git a/src/utilities/clblast_exceptions.cpp b/src/utilities/clblast_exceptions.cpp
index 32526215..25e5f4be 100644
--- a/src/utilities/clblast_exceptions.cpp
+++ b/src/utilities/clblast_exceptions.cpp
@@ -45,7 +45,7 @@ RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreas
 
 // =================================================================================================
 
-StatusCode DispatchException()
+StatusCode DispatchException(const bool silent)
 {
   const char *message = nullptr;
   StatusCode status;
@@ -66,12 +66,41 @@ StatusCode DispatchException()
     status = StatusCode::kUnknownError;
   }
 
-  if (message) {
+  if (message && !silent) {
     fprintf(stderr, "CLBlast: %s\n", message);
   }
   return status;
 }
 
+StatusCode DispatchExceptionCatchAll(const bool silent)  // must be called from a catch-handler
+{
+  const char *message = nullptr;  // stays null when nothing should be printed
+  StatusCode status;
+
+  try {
+    throw;  // re-throws the exception currently being handled to classify it by type below
+  } catch (BLASError &e) {
+    // no message is printed for invalid argument errors
+    status = e.status();
+  } catch (CLCudaAPIError &e) {
+    message = e.what();
+    status = static_cast<StatusCode>(e.status());  // the API error code is reused as status code
+  } catch (RuntimeErrorCode &e) {
+    message = e.what();
+    status = e.status();
+  } catch (Error<std::runtime_error> &e) {
+    message = e.what();
+    status = StatusCode::kUnknownError;
+  } catch (...) {  // any other exception type, including those not derived from std::exception
+    message = "unknown exception type";
+    status = StatusCode::kUnknownError;
+  }
+
+  if (message && !silent) {  // 'silent' suppresses the message on stderr, not the returned status
+    fprintf(stderr, "CLBlast: %s\n", message);
+  }
+  return status;
+}
 // =================================================================================================
 
 StatusCode DispatchExceptionForC()
diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp
index a790be9c..9bd38187 100644
--- a/src/utilities/clblast_exceptions.hpp
+++ b/src/utilities/clblast_exceptions.hpp
@@ -37,7 +37,8 @@ class RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> {
 // =================================================================================================
 
 // Handles (most of the) runtime exceptions and converts them to StatusCode
-StatusCode DispatchException();
+StatusCode DispatchException(const bool silent = false);
+StatusCode DispatchExceptionCatchAll(const bool silent = false);
 
 // Handles remaining exceptions and converts them to StatusCode::kUnhandledError
 StatusCode DispatchExceptionForC();
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
new file mode 100644
index 00000000..2a55506e
--- /dev/null
+++ b/src/utilities/compile.cpp
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the kernel compilation functions (see the header for more information).
+//
+// =================================================================================================
+
+#include <vector>
+#include <chrono>
+
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source code; re-throws on build failure ('silent' only hides the log)
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options, const bool silent) {
+  auto header_string = std::string{""};  // defines and headers that are prepended to the source
+
+  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
+
+  // Adds the name of the routine as a define
+  header_string += "#define ROUTINE_" + routine_name + "\n";
+
+  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+  // which it is known to work with all OpenCL platforms.
+  if (device.IsNVIDIA() || device.IsARM()) {
+    header_string += "#define USE_INLINE_KEYWORD 1\n";
+  }
+
+  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
+  // performance, but might result in a reduced accuracy.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_CL_MAD 1\n";
+  }
+
+  // For specific devices, use staggered/shuffled workgroup indices.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device.IsARM() && device.IsGPU()) {
+    header_string += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
+  #ifdef CUDA_API
+    header_string +=  // not 'source_string': that is a const reference and cannot be appended to
+      #include "kernels/opencl_to_cuda.h"
+    ;
+  #endif
+
+  // Loads the common header (typedefs and defines and such)
+  header_string +=
+    #include "kernels/common.opencl"
+  ;
+
+  // Prints details of the routine to compile in case of debugging in verbose mode
+  #ifdef VERBOSE
+    printf("[DEBUG] Compiling routine '%s-%s'\n",
+           routine_name.c_str(), ToString(precision).c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
+  // Compiles the kernel: the generated header goes in front of the caller-supplied source
+  auto program = Program(context, header_string + source_string);
+  try {
+    program.Build(device, options);
+  } catch (const CLCudaAPIBuildError &e) {
+    if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) {
+      fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
+              program.GetBuildInfo(device).c_str());
+    }
+    throw;  // the build error always propagates to the caller, also when 'silent' is set
+  }
+
+  // Prints the elapsed compilation time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+  #endif
+
+  return program;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp
new file mode 100644
index 00000000..0315d70c
--- /dev/null
+++ b/src/utilities/compile.hpp
@@ -0,0 +1,36 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the CLBlast way to compile a kernel from source, used for the library and for
+// the auto-tuners.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_UTILITIES_COMPILE_H_
+#define CLBLAST_UTILITIES_COMPILE_H_
+
+#include <string>
+#include <vector>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source code; 'silent' suppresses printing of the compiler build log
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options, const bool silent = false);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_UTILITIES_COMPILE_H_
+#endif
diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp
new file mode 100644
index 00000000..af6a8ff2
--- /dev/null
+++ b/src/utilities/timing.cpp
@@ -0,0 +1,79 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides helper functions for time measurement and such.
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <exception>
+
+#include "utilities/timing.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local) {
+  // Validates the launch set-up, then returns what TimeFunction measures over 'num_runs'
+  auto event = Event();
+  if (!local.empty()) {
+    // Tests for validity of the local thread sizes (each failure throws a RuntimeErrorCode)
+    if (local.size() > device.MaxWorkItemDimensions()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
+    }
+    const auto max_work_item_sizes = device.MaxWorkItemSizes();
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (local[i] > max_work_item_sizes[i]) {
+        throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
+      }
+    }
+    auto local_size = size_t{1};
+    for (auto &item: local) { local_size *= item; }
+    if (local_size > device.MaxWorkGroupSize()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+    }
+
+    // Global sizes must be >= local sizes; bounded by both vectors to not index out of range
+    for (auto i=size_t{0}; i<global.size() && i<local.size(); ++i) {
+      if (global[i] < local[i]) { global[i] = local[i]; }
+    }
+  }
+
+  // Tests for local memory usage (throws a RuntimeErrorCode if the device rejects it)
+  const auto local_mem_usage = kernel.LocalMemUsage(device);
+  if (!device.IsLocalMemoryValid(local_mem_usage)) {
+    throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
+  }
+
+  // Times the kernel: every run launches it, waits on the event and finishes the queue
+  const auto run_kernel_func = [&]() {
+      kernel.Launch(queue, global, local, event.pointer());
+      event.WaitForCompletion();
+      queue.Finish();
+  };
+  return TimeFunction(num_runs, run_kernel_func);
+}
+
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local) {
+  auto result_ms = -1.0; // stays negative (invalid) if timing fails; a cell is printed either way
+  try {
+    result_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local);
+    printf(" %9.2lf ms |", result_ms);
+  }
+  catch (...) {
+    const auto error_code = DispatchExceptionCatchAll(true);
+    printf("  error %-5d |", static_cast<int>(error_code));
+  }
+  return result_ms;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index bfad6147..a66aba4b 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -40,6 +40,14 @@ double TimeFunction(const size_t num_runs, F const &function) {
 
 // =================================================================================================
 
+// Times a kernel: the first throws on an invalid launch set-up, the second prints the outcome
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local);
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local);
+
+// =================================================================================================
+
 using Timing = std::pair<size_t, double>;
 
 template <typename T, typename F>
@@ -47,76 +55,27 @@ std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t
                                 const size_t num_runs, const Queue& queue,
                                 const std::vector<Buffer<T>>& buffers, F const &routine) {
   auto timings = std::vector<Timing>();
+  printf("|  value |         time |\n");
+  printf("x--------x--------------x\n");
   for (auto value = from; value < to; value += step) {
-    printf("[ RUN      ] Running with value %zu\n", value);
+    printf("| %6zu |", value);
     try {
       const auto FunctionToTune = [&]() { routine(value, queue, buffers); };
       const auto time_ms = TimeFunction(num_runs, FunctionToTune);
-      printf("[       OK ] Took %.2lf ms\n", time_ms);
+      printf(" %9.2lf ms |\n", time_ms);
       timings.push_back({value, time_ms});
     }
     catch (...) {
-      printf("[    ERROR ] Exception caught\n");
+      const auto status_code = DispatchExceptionCatchAll(true);
+      printf("  error %-5d |\n", static_cast<int>(status_code));
       timings.push_back({value, -1.0}); // invalid
     }
   }
+  printf("x--------x--------------x\n");
   return timings;
 }
 
 // =================================================================================================
-
-using TuningParameter = std::pair<std::string, size_t>;
-using TuningParameters = std::vector<TuningParameter>;
-struct TuningResult { std::string name; double score; TuningParameters parameters; };
-
-void PrintTimingsToFileAsJSON(const std::string &filename,
-                              const Device& device, const Platform& platform,
-                              const std::vector<std::pair<std::string,std::string>> &metadata,
-                              const std::vector<TuningResult>& tuning_results) {
-  printf("[  STATUS  ] Writing results to '%s'\n", filename.c_str());
-  auto file = fopen(filename.c_str(), "w");
-  fprintf(file, "{\n");
-  for (auto &datum: metadata) {
-    fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
-  }
-  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
-  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
-  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
-  fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
-  fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
-  fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
-  fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
-  fprintf(file, "  \"results\": [\n");
-
-  // Loops over all results
-  auto num_results = tuning_results.size();
-  for (auto r = size_t{0}; r < num_results; ++r) {
-    auto result = tuning_results[r];
-    fprintf(file, "    {\n");
-    fprintf(file, "      \"kernel\": \"%s\",\n", result.name.c_str());
-    fprintf(file, "      \"time\": %.3lf,\n", result.score);
-
-    // Loops over all the parameters for this result
-    fprintf(file, "      \"parameters\": {");
-    auto num_configs = result.parameters.size();
-    for (auto p=size_t{0}; p<num_configs; ++p) {
-      auto config = result.parameters[p];
-      fprintf(file, "\"%s\": %zu", config.first.c_str(), config.second);
-      if (p < num_configs-1) { fprintf(file, ","); }
-    }
-    fprintf(file, "}\n");
-
-    // The footer
-    fprintf(file, "    }");
-    if (r < num_results - 1) { fprintf(file, ","); }
-    fprintf(file, "\n");
-  }
-  fprintf(file, "  ]\n");
-  fprintf(file, "}\n");
-  fclose(file);
-}
-
-// =================================================================================================
 } // namespace clblast
 
 // CLBLAST_TIMING_H_
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index f2574104..1546fbf5 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -397,6 +397,37 @@ template <> bool PrecisionSupported<half>(const Device &device) { return device.
 
 // =================================================================================================
 
+// Squared difference, e.g. for the L2 error; done in double so float inputs cannot overflow
+template <typename T>
+double SquaredDifference(const T val1, const T val2) {
+  const auto difference = static_cast<double>(val1) - static_cast<double>(val2);
+  return difference * difference;
+}
+
+// Compiles the default case for standard data-types
+template double SquaredDifference<float>(const float, const float);
+template double SquaredDifference<double>(const double, const double);
+
+// Specialisations for non-standard data-types: complex values add up real and imaginary parts
+template <>
+double SquaredDifference(const float2 val1, const float2 val2) {
+  const auto real_part = SquaredDifference(val1.real(), val2.real());
+  return real_part + SquaredDifference(val1.imag(), val2.imag());
+}
+
+template <>
+double SquaredDifference(const double2 val1, const double2 val2) {
+  const auto real_part = SquaredDifference(val1.real(), val2.real());
+  return real_part + SquaredDifference(val1.imag(), val2.imag());
+}
+
+template <> // half-precision values are first converted with HalfToFloat
+double SquaredDifference(const half val1, const half val2) {
+  return SquaredDifference(HalfToFloat(val1), HalfToFloat(val2));
+}
+
+// =================================================================================================
+
 // High-level info
 std::string GetDeviceType(const Device& device) {
   return device.Type();
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index f56226be..e26721b3 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -98,6 +98,7 @@ constexpr auto kArgDilationW = "dilationw";
 // The tuner-specific arguments in string form
 constexpr auto kArgFraction = "fraction";
 constexpr auto kArgHeuristicSelection = "heuristic";
+constexpr auto kArgMaxL2Norm = "max_l2_norm";
 // PSO tuner-specific arguments in string form
 constexpr auto kArgPsoSwarmSize = "pso_swarm_size";
 constexpr auto kArgPsoInfGlobal = "pso_inf_global";
@@ -323,6 +324,12 @@ bool PrecisionSupported(const Device &device);
 
 // =================================================================================================
 
+// Squared difference (for e.g. the L2 error) of float, double, float2, double2 or half values
+template <typename T>
+double SquaredDifference(const T val1, const T val2);
+
+// =================================================================================================
+
 // Device information in a specific CLBlast form
 std::string GetDeviceType(const Device& device);
 std::string GetDeviceVendor(const Device& device);
author	Cedric Nugteren <web@cedricnugteren.nl>	2017-11-19 20:05:15 +0100
committer	GitHub <noreply@github.com>	2017-11-19 20:05:15 +0100
commit	da76d7ab81555452a1049eb1a6d130073427067d (patch)
tree	92439d8bee44c34d63f288a73bdc372ba84dc42b /src
parent	c41d219ea42087c1b8d933b733b381005123cb91 (diff)
parent	defad3d1a249dd5f8c011cf28cc3c888d710d56a (diff)