Merge pull request #216 from CNugteren/integrated_tuner

Integrated tuner
author: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-19 20:05:15 +0100
committer: GitHub <noreply@github.com> 2017-11-19 20:05:15 +0100
commit: da76d7ab81555452a1049eb1a6d130073427067d (patch)
tree: 92439d8bee44c34d63f288a73bdc372ba84dc42b
parent: c41d219ea42087c1b8d933b733b381005123cb91 (diff)
parent: defad3d1a249dd5f8c011cf28cc3c888d710d56a (diff)
36 files changed, 1126 insertions, 676 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 200fdc53..95508951 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,6 @@
 
 Development (next version)
+- Re-designed and integrated the auto-tuner, no more dependency on CLTune
 - Added tuned parameters for various devices (see README)
 
 Version 1.2.0
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cada61ab..139e230e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ set(clblast_VERSION_PATCH 0)
 # Options and their default values
 option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON)
 option(SAMPLES "Enable compilation of the examples" OFF)
-option(TUNERS "Enable compilation of the tuners" OFF)
+option(TUNERS "Enable compilation of the tuners" ON)
 option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
 option(TESTS "Enable compilation of the correctness tests" OFF)
 option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
@@ -156,15 +156,6 @@ elseif(CUDA)
   link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
 endif()
 
-# Locates the CLTune library in case the tuners need to be compiled. "FindCLTune.cmake" is included.
-if(TUNERS)
-  find_package(CLTune)
-  if(NOT CLTUNE_FOUND)
-    message(STATUS "Could NOT find CLTune, disabling the compilation of the tuners")
-    set(TUNERS OFF)
-  endif()
-endif()
-
 # Don't search for system libraries when cross-compiling
 if(${CMAKE_SYSTEM_NAME} STREQUAL Android)
   if(TESTS)
@@ -233,7 +224,9 @@ endif()
 set(SOURCES
   src/database/database.cpp
   src/routines/common.cpp
+  src/utilities/compile.cpp
   src/utilities/clblast_exceptions.cpp
+  src/utilities/timing.cpp
   src/utilities/utilities.cpp
   src/api_common.cpp
   src/cache.cpp
@@ -252,6 +245,7 @@ set(HEADERS  # such that they can be discovered by IDEs such as CLion and Visual
   src/routines/common.hpp
   src/routines/routines.hpp
   src/utilities/buffer_test.hpp
+  src/utilities/compile.hpp
   src/utilities/clblast_exceptions.hpp
   src/utilities/device_mapping.hpp
   src/utilities/msvc.hpp
@@ -373,27 +367,50 @@ endif()
 
 # ==================================================================================================
 
-# This section contains all the code related to the tuners. These tuners require the presence of
-# the CLTune library (not included as part of the source).
+# This section contains all the code related to the tuners
 if(TUNERS)
 
-  # Visual Studio requires the sources of non-exported objects/libraries
-  set(TUNERS_COMMON src/tuning/tuning.hpp)
+  set(TUNERS_COMMON
+      src/utilities/compile.cpp
+      src/utilities/clblast_exceptions.cpp
+      src/utilities/timing.cpp
+      src/utilities/utilities.cpp
+      src/tuning/configurations.cpp
+      src/tuning/tuning.cpp)
+  set(TUNERS_HEADERS  # such that they can be discovered by IDEs such as CLion and Visual Studio
+      src/utilities/compile.hpp
+      src/utilities/clblast_exceptions.hpp
+      src/utilities/timing.hpp
+      src/utilities/utilities.hpp
+      src/tuning/configurations.hpp
+      src/tuning/tuning.hpp)
+  set(TUNERS_COMMON ${TUNERS_COMMON} ${TUNERS_HEADERS})
+
+  # Creates a library with common sources for all tuners
   if(MSVC)
-    set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
+    # Visual Studio requires the sources of non-exported objects/libraries
+  else()
+    # Creates the common tuner objects (requires CMake 2.8.8)
+    add_library(tuners_common_library OBJECT ${TUNERS_COMMON})
+
+    # Adds CLBlast's interface include paths because we can't link to CLBlast here
+    target_include_directories(tuners_common_library PRIVATE
+                               $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
+                               ${clblast_SOURCE_DIR} ${API_INCLUDE_DIRS})
+    set(TUNERS_COMMON $<TARGET_OBJECTS:tuners_common_library>)
   endif()
 
   # Adds tuning executables
   foreach(KERNEL ${KERNELS})
     add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
-    target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
-    target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
+    target_link_libraries(clblast_tuner_${KERNEL} ${API_LIBRARIES})
+    target_include_directories(clblast_tuner_${KERNEL} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
   foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
     add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
-    target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES})
-    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC ${CLTUNE_INCLUDE_DIRS})
+    target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
+    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
     install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
   endforeach()
 
diff --git a/README.md b/README.md
index 5f4b3d15..6c27af51 100644
--- a/README.md
+++ b/README.md
@@ -180,8 +180,6 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
 
     cmake -DTUNERS=ON ..
 
-Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.6.0 or higher).
-
 Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables.
 
 The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python (2.7 or 3.x) script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
@@ -416,7 +414,7 @@ More information
 Further information on CLBlast is available through the following links:
 
 * A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf).
-* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017). For CLTune, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
+* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017). For CLTune, the inspiration for the included auto-tuner, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
 
 
 Support us
diff --git a/cmake/Modules/FindCLTune.cmake b/cmake/Modules/FindCLTune.cmake
deleted file mode 100644
index 3a37576a..00000000
--- a/cmake/Modules/FindCLTune.cmake
+++ /dev/null
@@ -1,68 +0,0 @@
-
-# ==================================================================================================
-# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-# width of 100 characters per line.
-#
-# Author(s):
-#   Cedric Nugteren <www.cedricnugteren.nl>
-#
-# ==================================================================================================
-#
-# Defines the following variables:
-#   CLTUNE_FOUND          Boolean holding whether or not the CLTune library was found
-#   CLTUNE_INCLUDE_DIRS   The CLTune include directory
-#   CLTUNE_LIBRARIES      The CLTune library
-#
-# In case CLTune is not installed in the default directory, set the CLTUNE_ROOT variable to point to
-# the root of CLTune, such that 'cltune.h' can be found in $CLTUNE_ROOT/include. This can either be
-# done using an environmental variable (e.g. export CLTUNE_ROOT=/path/to/cltune) or using a CMake
-# variable (e.g. cmake -DCLTUNE_ROOT=/path/to/cltune ..).
-#
-# ==================================================================================================
-
-# Sets the possible install locations
-set(CLTUNE_HINTS
-  ${CLTUNE_ROOT}
-  $ENV{CLTUNE_ROOT}
-)
-set(CLTUNE_PATHS
-  /usr
-  /usr/local
-)
-
-# Finds the include directories
-find_path(CLTUNE_INCLUDE_DIRS
-  NAMES cltune.h
-  HINTS ${CLTUNE_HINTS}
-  PATH_SUFFIXES include inc include/x86_64 include/x64
-  PATHS ${CLTUNE_PATHS}
-  DOC "CLTune include header cltune.h"
-)
-mark_as_advanced(CLTUNE_INCLUDE_DIRS)
-
-# Finds the library
-find_library(CLTUNE_LIBRARIES
-  NAMES cltune
-  HINTS ${CLTUNE_HINTS}
-  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32
-  PATHS ${CLTUNE_PATHS}
-  DOC "CLTune library"
-)
-mark_as_advanced(CLTUNE_LIBRARIES)
-
-# ==================================================================================================
-
-# Notification messages
-if(NOT CLTUNE_INCLUDE_DIRS)
-    message(STATUS "Could NOT find 'cltune.h', install CLTune or set CLTUNE_ROOT")
-endif()
-if(NOT CLTUNE_LIBRARIES)
-    message(STATUS "Could NOT find CLTune library, install it or set CLTUNE_ROOT")
-endif()
-
-# Determines whether or not CLTune was found
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(CLTune DEFAULT_MSG CLTUNE_INCLUDE_DIRS CLTUNE_LIBRARIES)
-
-# ==================================================================================================
diff --git a/scripts/database/database.py b/scripts/database/database.py
index 8f3ccce6..28f6ebf8 100755
--- a/scripts/database/database.py
+++ b/scripts/database/database.py
@@ -127,7 +127,7 @@ def main(argv):
     # Removes database entries before continuing
     if cl_args.remove_device is not None:
         print("[database] Removing all results for device '%s'" % cl_args.remove_device)
-        remove_database_entries(database, {"clblast_device": cl_args.remove_device})
+        remove_database_entries(database, {"clblast_device_name": cl_args.remove_device})
         io.save_database(database, database_filename)
 
     # Retrieves the best performing results
diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py
index 2b4f734c..1a541fff 100644
--- a/scripts/database/database/clblast.py
+++ b/scripts/database/database/clblast.py
@@ -173,7 +173,6 @@ def print_cpp_database(database, output_dir):
                                 kernels = sorted(set([s["kernel"] for s in device_database]))
                                 for kernel in kernels:
                                     kernel_database = [s for s in device_database if s["kernel"] == kernel]
-
                                     assert len(kernel_database) == 1
                                     results = kernel_database[0]["results"]
 
diff --git a/scripts/database/database/io.py b/scripts/database/database/io.py
index 15a39cc1..29d47591 100644
--- a/scripts/database/database/io.py
+++ b/scripts/database/database/io.py
@@ -83,6 +83,14 @@ def load_tuning_results(filename):
     # Removes the numbering following the kernel family name
     json_data["kernel_family"] = re.sub(r'_\d+', '', json_data["kernel_family"])
 
+    # Removes unnecessary data (keys may be absent; drop them whatever their value)
+    if "best_kernel" in json_data:
+        del json_data["best_kernel"]
+    if "best_time" in json_data:
+        del json_data["best_time"]
+    if "best_parameters" in json_data:
+        del json_data["best_parameters"]
+
     # Adds the kernel name to the section instead of to the individual results
     assert len(json_data["results"]) > 0
     json_data["kernel"] = json_data["results"][0]["kernel"]
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 82fc44fd..0db64ad9 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -352,6 +352,13 @@ class Device {
            std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV));
   }
 
+  // Retrieves the above extra information (if present)
+  std::string GetExtraInfo() const {
+    if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); }
+    if (HasExtension("cl_nv_device_attribute_query")) { return NVIDIAComputeCapability(); }
+    else { return std::string{""}; }
+  }
+
   // Accessor to the private data-member
   const RawDeviceID& operator()() const { return device_; }
  private:
diff --git a/src/cupp11.hpp b/src/cupp11.hpp
index ec21c5b1..00337ebd 100644
--- a/src/cupp11.hpp
+++ b/src/cupp11.hpp
@@ -326,6 +326,9 @@ public:
   std::string AMDBoardName() const { return ""; }
   std::string NVIDIAComputeCapability() const { return Capabilities(); }
 
+  // Retrieves the above extra information
+  std::string GetExtraInfo() const { return NVIDIAComputeCapability(); }
+
   // Accessor to the private data-member
   const RawDeviceID& operator()() const { return device_; }
 private:
diff --git a/src/routine.cpp b/src/routine.cpp
index 81201eea..93882fbf 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -135,74 +135,21 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
     throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
   }
 
-  // Collects the parameters for this device in the form of defines, and adds the precision
+  // Collects the parameters for this device in the form of defines
   auto source_string = std::string{""};
   for (const auto &kernel_name : kernel_names_) {
     source_string += db_(kernel_name).GetDefines();
   }
-  source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
-
-  // Adds the name of the routine as a define
-  source_string += "#define ROUTINE_"+routine_name_+"\n";
-
-  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
-  // which it is known to work with all OpenCL platforms.
-  if (device_.IsNVIDIA() || device_.IsARM()) {
-    source_string += "#define USE_INLINE_KEYWORD 1\n";
-  }
-
-  // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
-  // performance, but might result in a reduced accuracy.
-  if (device_.IsAMD() && device_.IsGPU()) {
-    source_string += "#define USE_CL_MAD 1\n";
-  }
-
-  // For specific devices, use staggered/shuffled workgroup indices.
-  if (device_.IsAMD() && device_.IsGPU()) {
-    source_string += "#define USE_STAGGERED_INDICES 1\n";
-  }
-
-  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
-  // performance through better cache behaviour
-  if (device_.IsARM() && device_.IsGPU()) {
-    source_string += "#define GLOBAL_MEM_FENCE 1\n";
-  }
-
-  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
-  #ifdef CUDA_API
-    source_string +=
-      #include "kernels/opencl_to_cuda.h"
-    ;
-  #endif
-
-  // Loads the common header (typedefs and defines and such)
-  source_string +=
-    #include "kernels/common.opencl"
-  ;
 
   // Adds routine-specific code to the constructed source string
   for (const char *s: source) {
     source_string += s;
   }
 
-  // Prints details of the routine to compile in case of debugging in verbose mode
-  #ifdef VERBOSE
-    printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n",
-           routine_name_.c_str(), ToString(precision_).c_str(), device_name.c_str());
-    const auto start_time = std::chrono::steady_clock::now();
-  #endif
+  // Completes the source and compiles the kernel
+  program_ = CompileFromSource(source_string, precision_, routine_name_,
+                               device_, context_, options);
 
-  // Compiles the kernel
-  program_ = Program(context_, source_string);
-  try {
-    program_.Build(device_, options);
-  } catch (const CLCudaAPIBuildError &e) {
-    if (program_.StatusIsCompilationWarningOrError(e.status())) {
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
-              program_.GetBuildInfo(device_).c_str());
-    }
-    throw;
-  }
 
   // Store the compiled binary and program in the cache
   BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
@@ -210,13 +157,6 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
 
   ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
                                  Program{ program_ });
-
-  // Prints the elapsed compilation time in case of debugging in verbose mode
-  #ifdef VERBOSE
-    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
-    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
-    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
-  #endif
 }
 
 // =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index bf3b1762..06d001d9 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -20,6 +20,7 @@
 #include <vector>
 
 #include "utilities/utilities.hpp"
+#include "utilities/compile.hpp"
 #include "database/database.hpp"
 
 namespace clblast {
diff --git a/src/tuning/configurations.cpp b/src/tuning/configurations.cpp
new file mode 100644
index 00000000..459d66b1
--- /dev/null
+++ b/src/tuning/configurations.cpp
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune).
+// This is only used for the optional tuner binaries and not part of the core of CLBlast.
+//
+// =================================================================================================
+
+#include <vector>
+#include <string>
+
+#include "tuning/configurations.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Finds all configurations. It also applies the user-defined constraints within.
+std::vector<Configuration> SetConfigurations(const std::vector<Parameter> parameters,
+                                             const Constraints& constraints) {
+  auto config = Configuration();
+  auto configurations = std::vector<Configuration>();
+  PopulateConfigurations(parameters, 0, config, configurations, constraints);
+  return configurations;
+}
+
+// Iterates recursively over all permutations of the user-defined parameters
+void PopulateConfigurations(const std::vector<Parameter> &parameters,
+                            const size_t index, const Configuration &config,
+                            std::vector<Configuration> &configuration,
+                            const Constraints& constraints) {
+
+  // End of the chain: all parameters are considered, store the resulting configuration if it is a
+  // valid one according to the constraints
+  if (index == parameters.size()) {
+    if (ValidConfiguration(config, constraints)) {
+      configuration.push_back(config);
+    }
+    return;
+  }
+
+  // Tries every value of the current parameter (bound by reference, not copied) and recurses
+  const auto &parameter = parameters[index];
+  for (const auto &value: parameter.second) {
+    auto config_copy = config;
+    config_copy[parameter.first] = value;
+    PopulateConfigurations(parameters, index+1, config_copy, configuration, constraints);
+  }
+}
+
+// Loops over all user-defined constraints to check whether or not the configuration is valid
+bool ValidConfiguration(const Configuration &config,
+                        const Constraints& constraints) {
+
+  // Iterates over all constraints
+  for (auto &constraint: constraints) {
+
+    // Finds the values of the parameters
+    auto values = std::vector<size_t>(constraint.parameters.size());
+    for (auto i=size_t{0}; i<constraint.parameters.size(); ++i) {
+      values[i] = config.at(constraint.parameters[i]);
+    }
+
+    // Checks this constraint for these values
+    if (!constraint.valid_if(values)) {
+      return false;
+    }
+  }
+
+  // Everything was OK: this configuration is valid
+  return true;
+}
+
+// Multiplies and/or divides a thread configuration (local/global)
+std::vector<size_t> SetThreadConfiguration(const Configuration& config,
+                                           const std::vector<size_t> base,
+                                           const TransformVector& mul_config,
+                                           const TransformVector& div_config) {
+  auto result = base;
+  for (const auto &multipliers: mul_config) {
+    for (auto i = size_t{0}; i < multipliers.size(); ++i) {
+      result[i] *= config.at(multipliers[i]);
+    }
+  }
+  for (const auto &dividers: div_config) {
+    for (auto i = size_t{0}; i < dividers.size(); ++i) {
+      result[i] /= config.at(dividers[i]);
+    }
+  }
+  return result;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/configurations.hpp b/src/tuning/configurations.hpp
new file mode 100644
index 00000000..74679ff6
--- /dev/null
+++ b/src/tuning/configurations.hpp
@@ -0,0 +1,73 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune).
+// This is only used for the optional tuner binaries and not part of the core of CLBlast.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TUNING_CONFIGURATIONS_H_
+#define CLBLAST_TUNING_CONFIGURATIONS_H_
+
+#include <vector>
+#include <string>
+#include <map>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+using Configuration = std::map<std::string, size_t>;
+using Parameter = std::pair<std::string, std::vector<size_t>>;
+using TransformVector = std::vector<std::vector<std::string>>;
+
+// Helper structure holding a constraint on parameters. This constraint consists of a constraint
+// function object and a vector of parameter names represented as strings.
+using ConstraintFunction = std::function<bool(std::vector<size_t>)>;
+struct Constraint {
+  ConstraintFunction valid_if;
+  std::vector<std::string> parameters;
+};
+using Constraints = std::vector<Constraint>;
+
+// =================================================================================================
+
+// Initializes an empty configuration (vector of name/value pairs) and kicks-off the recursive
+// function to find all configurations. It also applies the user-defined constraints within.
+std::vector<Configuration> SetConfigurations(const std::vector<Parameter> parameters,
+                                             const Constraints& constraints);
+
+// Iterates recursively over all permutations of the user-defined parameters. This code creates
+// multiple chains, in which each chain selects a unique combination of values for all parameters.
+// At the end of each chain (when all parameters are considered), the function stores the result
+// into the configuration list.
+void PopulateConfigurations(const std::vector<Parameter> &parameters,
+                            const size_t index, const Configuration &config,
+                            std::vector<Configuration> &configuration,
+                            const Constraints& constraints);
+
+// Loops over all user-defined constraints to check whether or not the configuration is valid.
+// Assumes initially all configurations are valid, then returns false if one of the constraints has
+// not been met. Constraints consist of a user-defined function and a list of parameter names, which
+// are replaced by parameter values in this function.
+bool ValidConfiguration(const Configuration &config,
+                        const Constraints& constraints);
+
+// Processes multipliers and dividers to obtain the final thread configuration
+std::vector<size_t> SetThreadConfiguration(const Configuration& config,
+                                           const std::vector<size_t> base,
+                                           const TransformVector& mul_config,
+                                           const TransformVector& div_config);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TUNING_CONFIGURATIONS_H_
+#endif
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index 068c5f1b..462107d3 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels.
+// This file uses the auto-tuner to tune the copy OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TuneCopy {
     settings.kernel_family = "copy";
     settings.kernel_name = "CopyMatrixFast";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/copy_fast.opencl"
     ;
@@ -51,6 +50,10 @@ class TuneCopy {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,20 +81,15 @@ class TuneCopy {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(2, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(3, GetRealArg(args.alpha));
   }
 };
 
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 7102d05d..24557517 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels.
+// This file uses the auto-tuner to tune the pad OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TunePad {
     settings.kernel_family = "pad";
     settings.kernel_name = "CopyPadMatrix";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/copy_pad.opencl"
     ;
@@ -51,6 +50,10 @@ class TunePad {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,28 +81,23 @@ class TunePad {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(0);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.m));
+    kernel.SetArgument(3, 0);
+    kernel.SetArgument(4, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(5, static_cast<int>(args.m));
+    kernel.SetArgument(6, static_cast<int>(args.n));
+    kernel.SetArgument(7, static_cast<int>(args.m));
+    kernel.SetArgument(8, 0);
+    kernel.SetArgument(9, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(10, GetRealArg(args.alpha));
+    kernel.SetArgument(11, 0);
   }
 };
 
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 56726903..1e0d3c7b 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels.
+// This file uses the auto-tuner to tune the transpose OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TuneTranspose {
     settings.kernel_family = "transpose";
     settings.kernel_name = "TransposeMatrixFast";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/transpose_fast.opencl"
     ;
@@ -51,6 +50,10 @@ class TuneTranspose {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,25 +81,15 @@ class TuneTranspose {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"});
-  }
+  static std::vector<Constraint> SetConstraints() { return {}; } // returns an empty constraint list
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args, // binds the 4 kernel arguments by explicit index
+                           std::vector<Buffer<T>>& buffers) { // only buffers[2] and buffers[3] are read here
+    kernel.SetArgument(0, static_cast<int>(args.m)); // 'm' is the only size passed to this kernel
+    kernel.SetArgument(1, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(2, buffers[3]()); // 3 == B matrix (the ID listed in settings.outputs)
+    kernel.SetArgument(3, GetRealArg(args.alpha)); // alpha is passed through GetRealArg() first
   }
 };
 
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index dc46e903..087f8e67 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels.
+// This file uses the auto-tuner to tune the pad-transpose OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TunePadTranspose {
     settings.kernel_family = "padtranspose";
     settings.kernel_name = "TransposePadMatrix";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/level3.opencl"
 #include "../src/kernels/level3/transpose_pad.opencl"
     ;
@@ -51,6 +50,10 @@ class TunePadTranspose {
     settings.size_a = args.m * args.n;
     settings.size_b = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3};
+    settings.outputs = {3};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -77,33 +80,23 @@ class TunePadTranspose {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"});
-  }
+  static std::vector<Constraint> SetConstraints() { return {}; } // returns an empty constraint list
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(0);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args, // binds the 12 kernel arguments by explicit index
+                           std::vector<Buffer<T>>& buffers) { // only buffers[2] and buffers[3] are read here
+    kernel.SetArgument(0, static_cast<int>(args.m)); // args 0-3: scalars (m, n, m, 0) passed ahead of the A buffer
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.m));
+    kernel.SetArgument(3, 0); // constant zero; presumably an offset - TODO confirm against the kernel signature
+    kernel.SetArgument(4, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(5, static_cast<int>(args.n)); // args 5-8: (n, m, n, 0), i.e. m and n swapped relative to args 0-2
+    kernel.SetArgument(6, static_cast<int>(args.m));
+    kernel.SetArgument(7, static_cast<int>(args.n));
+    kernel.SetArgument(8, 0); // constant zero; presumably an offset - TODO confirm against the kernel signature
+    kernel.SetArgument(9, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(10, GetRealArg(args.alpha)); // alpha is passed through GetRealArg() first
+    kernel.SetArgument(11, 0); // trailing constant zero; meaning not visible here - TODO confirm
   }
 };
 
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index e201949a..d843ea78 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xaxpy OpenCL kernels.
+// This file uses the auto-tuner to tune the xaxpy OpenCL kernels.
 //
 // =================================================================================================
 
@@ -41,7 +41,6 @@ class TuneXaxpy {
     settings.kernel_family = "xaxpy";
     settings.kernel_name = "XaxpyFastest";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level1/level1.opencl"
 #include "../src/kernels/level1/xaxpy.opencl"
     ;
@@ -50,6 +49,10 @@ class TuneXaxpy {
     settings.size_x = args.n;
     settings.size_y = args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1};
+    settings.outputs = {1};
+
     // Sets the base thread configuration
     settings.global_size = {args.n};
     settings.global_size_ref = settings.global_size;
@@ -80,20 +83,15 @@ class TuneXaxpy {
       throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW");
     }
   }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; } // returns an empty constraint list
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentInput(x_vec);
-    tuner.AddArgumentOutput(y_vec);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args, // binds the 4 kernel arguments by explicit index
+                           std::vector<Buffer<T>>& buffers) { // only buffers[0] and buffers[1] are read here
+    kernel.SetArgument(0, static_cast<int>(args.n)); // vector length, narrowed to int
+    kernel.SetArgument(1, GetRealArg(args.alpha)); // alpha is passed through GetRealArg() first
+    kernel.SetArgument(2, buffers[0]()); // 0 == X vector
+    kernel.SetArgument(3, buffers[1]()); // 1 == Y vector (the ID listed in settings.outputs)
   }
 };
 
diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp
index fb532680..12350657 100644
--- a/src/tuning/kernels/xdot.cpp
+++ b/src/tuning/kernels/xdot.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xdot OpenCL kernels. Note that the results are
+// This file uses the auto-tuner to tune the xdot OpenCL kernels. Note that the results are
 // not verified, since the result is not final and depends on the WGS2 parameter.
 //
 // =================================================================================================
@@ -42,7 +42,6 @@ class TuneXdot {
     settings.kernel_family = "xdot_"+std::to_string(V);
     settings.kernel_name = (V==1) ? "Xdot" : "XdotEpilogue";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level1/xdot.opencl"
     ;
 
@@ -51,6 +50,10 @@ class TuneXdot {
     settings.size_y = args.n;
     settings.size_temp = args.n; // Worst case
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1, 5};
+    settings.outputs = {}; // no output checking
+
     // Sets the base thread configuration
     settings.global_size = (V==1) ? std::vector<size_t>{2*64} : std::vector<size_t>{1};
     settings.global_size_ref = (V==1) ? std::vector<size_t>{2*64*64} : std::vector<size_t>{64};
@@ -58,8 +61,8 @@ class TuneXdot {
     settings.local_size_ref = {64};
 
     // Transforms the thread configuration based on the parameters
-    settings.mul_local = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}};
-    settings.mul_global = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}};
+    settings.mul_local = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}};
+    settings.mul_global = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}};
 
     // Sets the tuning parameters and their possible values
     settings.parameters = {
@@ -75,31 +78,26 @@ class TuneXdot {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; } // returns an empty constraint list
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &temp) {
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args, // binds kernel arguments by explicit index; layout depends on V
+                           std::vector<Buffer<T>>& buffers) { // buffers[0], buffers[1] and buffers[5] are read here
     if (V == 1) {
-      tuner.AddArgumentScalar(static_cast<int>(args.n));
-      tuner.AddArgumentInput(x_vec);
-      tuner.AddArgumentScalar(0);
-      tuner.AddArgumentScalar(1);
-      tuner.AddArgumentInput(y_vec);
-      tuner.AddArgumentScalar(0);
-      tuner.AddArgumentScalar(1);
-      tuner.AddArgumentInput(temp); // No output checking for the result - size varies
-      tuner.AddArgumentScalar(static_cast<int>(false));
+      kernel.SetArgument(0, static_cast<int>(args.n)); // vector length, narrowed to int
+      kernel.SetArgument(1, buffers[0]()); // 0 == X vector
+      kernel.SetArgument(2, 0); // presumably the X offset - TODO confirm against the kernel signature
+      kernel.SetArgument(3, 1); // presumably the X increment - TODO confirm against the kernel signature
+      kernel.SetArgument(4, buffers[1]()); // 1 == Y vector
+      kernel.SetArgument(5, 0); // presumably the Y offset - TODO confirm against the kernel signature
+      kernel.SetArgument(6, 1); // presumably the Y increment - TODO confirm against the kernel signature
+      kernel.SetArgument(7, buffers[5]()); // 5 == temp; no output checking - size varies
+      kernel.SetArgument(8, static_cast<int>(false)); // always 0; meaning of this flag is not visible here - TODO confirm
     }
     else {
-      tuner.AddArgumentInput(temp);
-      tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere
-      tuner.AddArgumentScalar(0);
+      kernel.SetArgument(0, buffers[5]()); // 5 == temp
+      kernel.SetArgument(1, buffers[0]()); // 0 == X vector; no output checking for the result - store somewhere
+      kernel.SetArgument(2, 0); // constant zero; presumably the result offset - TODO confirm
     }
   }
 };
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 6dcdf68b..16e32988 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations:
+// This file uses the auto-tuner to tune the xgemm OpenCL kernels. There are two variations:
 // - V==1: This tests some limited set of tuning parameters exhaustively.
 // - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
 //
@@ -38,7 +38,6 @@ class TuneXgemm {
     settings.default_k = 1024;
     settings.default_fraction = (V==1) ? 1.0 : 512.0; // test all or sample randomly
     settings.default_num_runs = 2;
-    settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch);
     return settings;
   }
 
@@ -50,7 +49,6 @@ class TuneXgemm {
     settings.kernel_family = (V==1) ? "xgemm_1" : "xgemm_2";
     settings.kernel_name = "Xgemm";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/xgemm_part1.opencl"
 #include "../src/kernels/level3/xgemm_part2.opencl"
 #include "../src/kernels/level3/xgemm_part3.opencl"
@@ -61,6 +59,10 @@ class TuneXgemm {
     settings.size_b = args.n * args.k;
     settings.size_c = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3, 4};
+    settings.outputs = {4};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -114,74 +116,51 @@ class TuneXgemm {
     settings.metric_amount = 2 * args.m * args.n * args.k;
     settings.performance_unit = "GFLOPS";
 
-    // Returns which search heuristic to use
-    if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); }
-    else {
-      // Use full-search to explore all parameter combinations or another strategy to search only a
-      // part of the parameter values. The fraction is set as a command-line argument.
-      if (args.fraction == 1.0 || args.fraction == 0.0) {
-        settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
-      } else {
-        settings.heuristic = args.heuristic_selection;
-      }
-    }
-
     return settings;
   }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints
-  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+  static std::vector<Constraint> SetConstraints() { // builds and returns the constraints between tuning parameters
+    auto constraints = std::vector<Constraint>(); // starts empty; filled by the push_back calls below
     auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
     auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
     auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
     // Requirement for unrolling the KWG loop
-    tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"});
+    constraints.push_back({MultipleOfX, {"KWG", "KWI"}}); // each entry pairs a predicate with the parameter names it is given
     // Required for integer MWI and NWI
-    tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"});
+    constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}});
+    constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}});
     // Required for integer MWIA and NWIB
-    tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"});
+    constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}});
+    constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}});
     // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...)
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"});
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"});
+    constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}});
+    constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}});
 
     // Extra constraints for variation 1 to limit the set of options significantly
     if (V==1) {
       auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
-      tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"});
-      tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"});
-      tuner.AddConstraint(id, IsEqual, {"SA", "SB"});
+      constraints.push_back({IsEqual, {"MDIMC", "MDIMA"}}); // IsEqual compares v[0] with v[1]
+      constraints.push_back({IsEqual, {"NDIMC", "NDIMB"}});
+      constraints.push_back({IsEqual, {"SA", "SB"}});
     }
-  }
-
-  // Sets the local memory size
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG",
-                                                    "SB", "KWG", "NWG"});
+    return constraints; // 7 entries, plus 3 more when V==1
   }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.k));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(GetRealArg(args.beta));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentInput(b_mat);
-    tuner.AddArgumentOutput(c_mat);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(0);
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args, // binds the 10 kernel arguments by explicit index
+                           std::vector<Buffer<T>>& buffers) { // buffers[2], buffers[3] and buffers[4] are read here
+    kernel.SetArgument(0, static_cast<int>(args.m)); // sizes m, n, k are narrowed to int
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.k));
+    kernel.SetArgument(3, GetRealArg(args.alpha)); // alpha and beta are passed through GetRealArg() first
+    kernel.SetArgument(4, GetRealArg(args.beta));
+    kernel.SetArgument(5, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(6, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(7, buffers[4]()); // 4 == C matrix (the ID listed in settings.outputs)
+    kernel.SetArgument(8, 0); // constant zero; meaning not visible here - TODO confirm against the kernel signature
+    kernel.SetArgument(9, 0); // constant zero; meaning not visible here - TODO confirm against the kernel signature
   }
 };
 
diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp
index 619fb37a..60a983b4 100644
--- a/src/tuning/kernels/xgemm_direct.cpp
+++ b/src/tuning/kernels/xgemm_direct.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations:
+// This file uses the auto-tuner to tune the direct xgemm kernels. There are two variations:
 // - V==1: This tests some limited set of tuning parameters exhaustively.
 // - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
 //
@@ -36,9 +36,8 @@ class TuneXgemmDirect {
     settings.default_m = 256;
     settings.default_n = 256;
     settings.default_k = 256;
-    settings.default_fraction = (V==1) ? 1.0 : 32.0; // test all or sample randomly
+    settings.default_fraction = (V==1) ? 1.0 : 64.0; // test all or sample randomly
     settings.default_num_runs = 4;
-    settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch);
     return settings;
   }
 
@@ -50,7 +49,6 @@ class TuneXgemmDirect {
     settings.kernel_family = (V==1) ? "xgemm_direct_1" : "xgemm_direct_2";
     settings.kernel_name = "XgemmDirectTN";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level3/xgemm_direct_part1.opencl"
 #include "../src/kernels/level3/xgemm_direct_part2.opencl"
 #include "../src/kernels/level3/xgemm_direct_part3.opencl"
@@ -61,6 +59,10 @@ class TuneXgemmDirect {
     settings.size_b = args.n * args.k;
     settings.size_c = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {2, 3, 4};
+    settings.outputs = {4};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -89,7 +91,7 @@ class TuneXgemmDirect {
     }
     else { // a lot more tuning parameters - has to be sampled randomly, too much to test all
       settings.parameters = {
-        {"WGD", {8, 16, 32, 64, 128}},
+        {"WGD", {8, 16, 32, 64}},
         {"MDIMCD", {8, 16, 32}},
         {"NDIMCD", {8, 16, 32}},
         {"MDIMAD", {8, 16, 32}},
@@ -106,79 +108,57 @@ class TuneXgemmDirect {
     settings.metric_amount = 2 * args.m * args.n * args.k;
     settings.performance_unit = "GFLOPS";
 
-    // Returns which search heuristic to use
-    if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); }
-    else {
-      // Use full-search to explore all parameter combinations or another strategy to search only a
-      // part of the parameter values. The fraction is set as a command-line argument.
-      if (args.fraction == 1.0 || args.fraction == 0.0) {
-        settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
-      } else {
-        settings.heuristic = args.heuristic_selection;
-      }
-    }
-
     return settings;
   }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints
-  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+  static std::vector<Constraint> SetConstraints() { // builds and returns the constraints between tuning parameters
+    auto constraints = std::vector<Constraint>(); // starts empty; filled by the push_back calls below
     auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
     auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
     auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
     // Requirement for unrolling the WGD loop
-    tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"});
+    constraints.push_back({MultipleOfX, {"WGD", "KWID"}}); // each entry pairs a predicate with the parameter names it is given
     // Required for integer MWID and NWID
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}});
     // Required for integer MWIAD and NWIBD
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"});
-    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}});
+    constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}});
     // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...)
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"});
-    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"});
+    constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}});
+    constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}});
 
     // Extra constraints for variation 1 to limit the set of options significantly
     if (V==1) {
       auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
-      tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"});
-      tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"});
+      constraints.push_back({IsEqual, {"MDIMCD", "MDIMAD"}}); // IsEqual compares v[0] with v[1]
+      constraints.push_back({IsEqual, {"NDIMCD", "NDIMBD"}});
     }
-  }
-
-  // Sets the local memory size
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"});
+    return constraints; // 7 entries, plus 2 more when V==1
   }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.k));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(GetRealArg(args.beta));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(0); // a_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld
-    tuner.AddArgumentInput(b_mat);
-    tuner.AddArgumentScalar(0); // b_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld
-    tuner.AddArgumentOutput(c_mat);
-    tuner.AddArgumentScalar(0); // c_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld
-    tuner.AddArgumentScalar(1); // c_do_transpose
-    tuner.AddArgumentScalar(0); // a_conjugate
-    tuner.AddArgumentScalar(0); // b_conjugate
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args, // binds the 17 kernel arguments by explicit index
+                           std::vector<Buffer<T>>& buffers) { // buffers[2], buffers[3] and buffers[4] are read here
+    kernel.SetArgument(0, static_cast<int>(args.m)); // sizes m, n, k are narrowed to int
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, static_cast<int>(args.k));
+    kernel.SetArgument(3, GetRealArg(args.alpha)); // alpha and beta are passed through GetRealArg() first
+    kernel.SetArgument(4, GetRealArg(args.beta));
+    kernel.SetArgument(5, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(6, 0); // a_offset
+    kernel.SetArgument(7, static_cast<int>(args.k)); // a_ld
+    kernel.SetArgument(8, buffers[3]()); // 3 == B matrix
+    kernel.SetArgument(9, 0); // b_offset
+    kernel.SetArgument(10, static_cast<int>(args.n)); // b_ld
+    kernel.SetArgument(11, buffers[4]()); // 4 == C matrix
+    kernel.SetArgument(12, 0); // c_offset
+    kernel.SetArgument(13, static_cast<int>(args.n)); // c_ld
+    kernel.SetArgument(14, 1); // c_do_transpose
+    kernel.SetArgument(15, 0); // a_conjugate
+    kernel.SetArgument(16, 0); // b_conjugate
   }
 };
 
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index e66b15f1..3eadd32b 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned:
+// This file uses the auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned:
 // 1: The full version of the kernel
 // 2: The fast version for non-transposed matrices
 // 3: The fast version for transposed matrices
@@ -45,7 +45,6 @@ class TuneXgemv {
     settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot");
     settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot");
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level2/xgemv.opencl"
 #include "../src/kernels/level2/xgemv_fast.opencl"
     ;
@@ -55,6 +54,10 @@ class TuneXgemv {
     settings.size_y = args.m;
     settings.size_a = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1, 2};
+    settings.outputs = {1};
+
     // Sets the base thread configuration
     settings.global_size = {args.m};
     settings.global_size_ref = settings.global_size;
@@ -63,9 +66,7 @@ class TuneXgemv {
 
     // Transforms the thread configuration based on the parameters
     settings.mul_local = {{"WGS"+std::to_string(V)}};
-    settings.div_global = (V==1 || V==2) ?
-                          TunerSettings::TransformVector{{"WPT"+std::to_string(V)}} :
-                          TunerSettings::TransformVector{};
+    settings.div_global = (V==1 || V==2) ? TransformVector{{"WPT"+std::to_string(V)}} : TransformVector{};
 
     // Sets the tuning parameters and their possible values
     if (V==1) {
@@ -98,53 +99,41 @@ class TuneXgemv {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+  static std::vector<Constraint> SetConstraints() { // builds and returns the constraints for kernel variant V
+    auto constraints = std::vector<Constraint>(); // stays empty when V==1
     if (V==2 || V==3) {
       auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
-      tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
+      constraints.push_back({MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}}); // names carry the variant suffix, e.g. "WPT2"
     }
     if (V==3) {
       auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; };
-      tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
-    }
-  }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    if (V==1 || V==2) {
-      auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
-      tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
-    }
-    else {
-      auto LocalMemorySize = [args] (std::vector<size_t> v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); };
-      tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
+      constraints.push_back({LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}}); // LargerOrEqual tests v[0] >= v[1]
     }
+    return constraints; // 0, 1 or 2 entries for V==1, V==2, V==3 respectively
   }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &) {
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args, // binds the 18 kernel arguments by explicit index
+                           std::vector<Buffer<T>>& buffers) { // buffers[0], buffers[1] and buffers[2] are read here
     auto a_rotated = (V==3) ? 1 : 0;
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentScalar(GetRealArg(args.beta));
-    tuner.AddArgumentScalar(static_cast<int>(a_rotated));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(x_vec);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(1);
-    tuner.AddArgumentOutput(y_vec);
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentScalar(1);
-    tuner.AddArgumentScalar(0); // Conjugate transpose
-    tuner.AddArgumentScalar(0); // Additional parameter
-    tuner.AddArgumentScalar(0); // Banded 'kl'
-    tuner.AddArgumentScalar(0); // Banded 'ku'
+    kernel.SetArgument(0, static_cast<int>(args.m)); // sizes m and n are narrowed to int
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, GetRealArg(args.alpha)); // alpha and beta are passed through GetRealArg() first
+    kernel.SetArgument(3, GetRealArg(args.beta));
+    kernel.SetArgument(4, a_rotated); // int flag computed above: 1 only when V==3
+    kernel.SetArgument(5, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(6, 0); // presumably the A offset - TODO confirm against the kernel signature
+    kernel.SetArgument(7, static_cast<int>(args.m)); // presumably the A leading dimension - TODO confirm
+    kernel.SetArgument(8, buffers[0]()); // 0 == X vector
+    kernel.SetArgument(9, 0); // presumably the X offset - TODO confirm against the kernel signature
+    kernel.SetArgument(10, 1); // presumably the X increment - TODO confirm against the kernel signature
+    kernel.SetArgument(11, buffers[1]()); // 1 == Y vector
+    kernel.SetArgument(12, 0); // presumably the Y offset - TODO confirm against the kernel signature
+    kernel.SetArgument(13, 1); // presumably the Y increment - TODO confirm against the kernel signature
+    kernel.SetArgument(14, 0); // Conjugate transpose
+    kernel.SetArgument(15, 0); // Additional parameter
+    kernel.SetArgument(16, 0); // Banded 'kl'
+    kernel.SetArgument(17, 0); // Banded 'ku'
   }
 };
 
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index c2eb1d31..745e553f 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels.
+// This file uses the auto-tuner to tune the xger OpenCL kernels.
 //
 // =================================================================================================
 
@@ -42,7 +42,6 @@ class TuneXger {
     settings.kernel_family = "xger";
     settings.kernel_name = "Xger";
     settings.sources =
-#include "../src/kernels/common.opencl"
 #include "../src/kernels/level2/level2.opencl"
 #include "../src/kernels/level2/xger.opencl"
     ;
@@ -52,6 +51,10 @@ class TuneXger {
     settings.size_y = args.n;
     settings.size_a = args.m * args.n;
 
+    // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+    settings.inputs = {0, 1, 2};
+    settings.outputs = {2};
+
     // Sets the base thread configuration
     settings.global_size = {args.m, args.n};
     settings.global_size_ref = settings.global_size;
@@ -78,29 +81,24 @@ class TuneXger {
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+  static std::vector<Constraint> SetConstraints() { return {}; }
 
   // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &x_vec, std::vector<T> &y_vec,
-                           std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(GetRealArg(args.alpha));
-    tuner.AddArgumentInput(x_vec);
-    tuner.AddArgumentScalar(0); // x_offset
-    tuner.AddArgumentScalar(1); // x_increment
-    tuner.AddArgumentInput(y_vec);
-    tuner.AddArgumentScalar(0); // y_offset
-    tuner.AddArgumentScalar(1); // y_increment
-    tuner.AddArgumentOutput(a_mat);
-    tuner.AddArgumentScalar(0); // a_offset
-    tuner.AddArgumentScalar(static_cast<int>(args.m)); // a_ld
-    tuner.AddArgumentScalar(0); // a_is_rowmajor
+  static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+                           std::vector<Buffer<T>>& buffers) {
+    kernel.SetArgument(0, static_cast<int>(args.m));
+    kernel.SetArgument(1, static_cast<int>(args.n));
+    kernel.SetArgument(2, GetRealArg(args.alpha));
+    kernel.SetArgument(3, buffers[0]()); // 0 == X vector
+    kernel.SetArgument(4, 0); // x_offset
+    kernel.SetArgument(5, 1); // x_increment
+    kernel.SetArgument(6, buffers[1]()); // 1 == Y vector
+    kernel.SetArgument(7, 0); // y_offset
+    kernel.SetArgument(8, 1); // y_increment
+    kernel.SetArgument(9, buffers[2]()); // 2 == A matrix
+    kernel.SetArgument(10, 0); // a_offset
+    kernel.SetArgument(11, static_cast<int>(args.m)); // a_ld
+    kernel.SetArgument(12, 0); // a_is_rowmajor
   }
 };
 
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index a880c97e..cd22137a 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -18,7 +18,7 @@
 #include <assert.h>
 
 #include "utilities/utilities.hpp"
-#include "utilities/timing.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -68,7 +68,7 @@ void TuneXgemm(int argc, char* argv[]) {
   const auto platform = Platform(platform_id);
   const auto device = Device(platform, device_id);
   if (!PrecisionSupported<T>(device)) {
-    printf("* Unsupported precision, skipping this tuning run\n\n");
+    printf("* Unsupported precision, skipping this tuning run\n");
     return;
   }
   const auto context = Context(device);
@@ -81,18 +81,18 @@ void TuneXgemm(int argc, char* argv[]) {
   auto buffers = std::vector<Buffer<T>>{a_mat, b_mat, c_mat};
 
   // In-direct version
-  printf("[----------] Testing the in-direct GEMM routine for m=n=k\n");
+  printf("\n* Testing the in-direct GEMM routine for m=n=k\n");
   ForceSelectIndirectFrom<T>(0, device);
   const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
 
   // Direct version
-  printf("[----------] Testing the direct GEMM routine for m=n=k\n");
+  printf("\n* Testing the direct GEMM routine for m=n=k\n");
   ForceSelectIndirectFrom<T>(to * to * to + 1, device);
   const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
 
   // Determining final score and best kernel selection point
   assert(indirect.size() == direct.size());
-  printf("[----------] Collecting results\n");
+  printf("\n* Collecting results\n");
   auto ratios = std::vector<double>(indirect.size());
   for (auto i = size_t{0}; i < indirect.size(); ++i) {
     ratios[i] = indirect[i].second / direct[i].second;
@@ -104,42 +104,55 @@ void TuneXgemm(int argc, char* argv[]) {
     for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
     const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
     const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1);
+    auto tuning_results = Configuration();
+    tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = indirect[i].first;
+    tuning_results["PRECISION"] = static_cast<size_t>(precision);
     scores[i] = TuningResult{
         "gemm_kernel_selection",
         (relative_score * relative_score) * 100 + epsilon,  // squared for proper default computation
-        TuningParameters{
-            TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first},
-            TuningParameter{"PRECISION", static_cast<size_t>(precision)}
-        }
+        tuning_results
     };
   }
 
   // Displaying results
-  printf("[ -------> ]   value indirect   direct    score (lowest means best switching point)\n");
+  printf("|   value |    indirect |      direct |  score   | (lowest score == best switching point)\n");
+  printf("x---------x-------------x-------------x----------x\n");
   for (auto i = size_t{0}; i < indirect.size(); ++i) {
     assert(indirect[i].first == direct[i].first);
     const auto value = indirect[i].first;
     if (indirect[i].second != -1 && direct[i].second != -1) {
       const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6);
       const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6);
-      printf("[ -------> ] %7zu %8.2lf %8.2lf %8.2lf\n",
+      printf("| %7zu | %8.2lf GF | %8.2lf GF | %8.3lf |\n",  // GF: the values are in GFLOPS
              value, gflops_indirect, gflops_direct, scores[i].score);
     }
   }
+  printf("x---------x-------------x-------------x----------x\n");
+  printf("\n");
+
+  // Computes the best switching point
+  auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
+  const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison);
+  const auto best_switching_point = best_configuration->config["XGEMM_MIN_INDIRECT_SIZE"];
+  const auto best_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(best_switching_point);
 
   // Outputs the results as JSON to disk, including some meta-data
   const auto precision_string = std::to_string(static_cast<size_t>(precision));
   auto metadata = std::vector<std::pair<std::string,std::string>>{
       {"kernel_family", "gemm_routine"},
+      {"precision", precision_string},
       {"arg_from", ToString(from)},
       {"arg_to", ToString(to)},
       {"arg_step", ToString(step)},
-      {"precision", precision_string},
+      {"best_kernel", best_configuration->name},
+      {"best_time", ToString(best_configuration->score)},
+      {"best_parameters", best_string}
   };
   PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json",
                            device, platform, metadata, scores);
 
-  printf("[  STATUS  ] All done\n");
+  printf("* Completed tuning process\n");
+  printf("\n");
 }
 
 // =================================================================================================
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
new file mode 100644
index 00000000..0af17a6f
--- /dev/null
+++ b/src/tuning/tuning.cpp
@@ -0,0 +1,88 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the generic CLBlast auto-tuner (inspired by CLTune). This is only used for
+//  the optional and stand-alone tuner binaries and not part of the core of CLBlast.
+//
+// =================================================================================================
+
+#include <vector>
+#include <string>
+#include <random>
+#include <utility>
+#include <algorithm>
+#include <iostream>
+
+#include "utilities/utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Writes the tuning results, the given metadata and the device properties as JSON to 'filename'.
+// Prints a message to stderr and writes nothing in case the file cannot be opened for writing.
+void PrintTimingsToFileAsJSON(const std::string &filename,
+                              const Device& device, const Platform& platform,
+                              const std::vector<std::pair<std::string,std::string>> &metadata,
+                              const std::vector<TuningResult>& tuning_results) {
+  auto num_results = tuning_results.size();
+  printf("* Writing a total of %zu results to '%s'\n", num_results, filename.c_str());
+
+  // Opens the output file; 'fopen' returns a null pointer upon failure
+  auto file = fopen(filename.c_str(), "w");
+  if (file == nullptr) {
+    fprintf(stderr, "* Could not open '%s' for writing\n", filename.c_str());
+    return;
+  }
+  fprintf(file, "{\n");
+  for (auto &datum: metadata) {
+    fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
+  }
+  fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
+  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
+  fprintf(file, "  \"device\": \"%s\",\n", device.Name().c_str());
+  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
+  fprintf(file, "  \"device_vendor\": \"%s\",\n", platform.Vendor().c_str());
+  fprintf(file, "  \"device_type\": \"%s\",\n", device.Type().c_str());
+  fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
+  fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
+  fprintf(file, "  \"device_extra_info\": \"%s\",\n", device.GetExtraInfo().c_str());
+  fprintf(file, "  \"results\": [\n");
+
+  // Loops over all results
+  for (auto r = size_t{0}; r < num_results; ++r) {
+    const auto &result = tuning_results[r];
+    fprintf(file, "    {\n");
+    fprintf(file, "      \"kernel\": \"%s\",\n", result.name.c_str());
+    fprintf(file, "      \"time\": %.3lf,\n", result.score);
+    // Loops over all the parameters for this result, separated by commas
+    fprintf(file, "      \"parameters\": {");
+    auto separator = "";
+    for (const auto &parameter : result.config) {
+      fprintf(file, "%s\"%s\": %zu", separator, parameter.first.c_str(), parameter.second);
+      separator = ",";
+    }
+    fprintf(file, "}\n");
+    // The footer, followed by a comma for all but the last result
+    fprintf(file, "    }%s\n", (r + 1 < num_results) ? "," : "");
+  }
+  fprintf(file, "  ]\n}\n");
+  fclose(file);
+}
+
+void print_separator(const size_t parameters_size) {
+  printf("x------x-------x");
+  for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); }
+  printf("-x----------------x--------------x--------x-------------------x\n");
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index bc9c0e03..2c7f6a0b 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -7,26 +7,45 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the interface to the CLTune auto-tuner. This is only used for the optional
-// and stand-alone tuner binaries and not part of the core of CLBlast.
+// This file implements the generic CLBlast auto-tuner (inspired by CLTune). This is only used for
+//  the optional and stand-alone tuner binaries and not part of the core of CLBlast.
 //
 // =================================================================================================
 
-#ifndef CLBLAST_TUNING_H_
-#define CLBLAST_TUNING_H_
+#ifndef CLBLAST_TUNING_TUNING_H_
+#define CLBLAST_TUNING_TUNING_H_
 
 #include <vector>
 #include <string>
 #include <random>
 #include <utility>
-
-#include <cltune.h>
+#include <algorithm>
+#include <iostream>
+#include <chrono>
 
 #include "utilities/utilities.hpp"
+#include "utilities/compile.hpp"
+#include "utilities/timing.hpp"
+#include "tuning/configurations.hpp"
 
 namespace clblast {
 // =================================================================================================
 
+// Constants holding start and end strings for terminal-output in colour
+#if defined(_WIN32)
+  const std::string kPrintError = "";
+  const std::string kPrintSuccess = "";
+  const std::string kPrintMessage = "";
+  const std::string kPrintEnd = "";
+#else
+  const std::string kPrintError = "\x1b[31m";
+  const std::string kPrintSuccess = "\x1b[32m";
+  const std::string kPrintMessage = "\x1b[1m";
+  const std::string kPrintEnd = "\x1b[0m";
+#endif
+
+// =================================================================================================
+
 // Structures for the tuners with all the default settings
 struct TunerDefaults {
 
@@ -41,15 +60,7 @@ struct TunerDefaults {
   // Other defaults
   size_t default_batch_count = 1;
   size_t default_num_runs = 10; // run every kernel this many times for averaging
-
-  // Search heuristic defaults
   double default_fraction = 1.0;
-  size_t default_swarm_size_PSO = 8;
-  double default_influence_global_PSO = 0.1;
-  double default_influence_local_PSO = 0.3;
-  double default_influence_random_PSO = 0.6;
-  size_t default_heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
-  double default_max_temp_ann = 1.0;
 };
 
 // Structures for the tuners with the remaining settings
@@ -68,6 +79,10 @@ struct TunerSettings {
   size_t size_c = 1;
   size_t size_temp = 1;
 
+  // Inputs and outputs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+  std::vector<size_t> inputs = {};
+  std::vector<size_t> outputs = {};
+
   // Sets the base thread configuration
   std::vector<size_t> global_size = {};
   std::vector<size_t> global_size_ref = {};
@@ -75,25 +90,32 @@ struct TunerSettings {
   std::vector<size_t> local_size_ref = {};
 
   // Transforms the thread configuration based on the parameters
-  using TransformVector = std::vector<std::vector<std::string>>;
   TransformVector mul_local = {};
   TransformVector div_local = {};
   TransformVector mul_global = {};
   TransformVector div_global = {};
 
   // Sets the tuning parameters and their possible values
-  std::vector<std::pair<std::string, std::vector<size_t>>> parameters;
+  std::vector<Parameter> parameters;
 
   // Describes how to compute the performance metrics
   size_t metric_amount = 0;
   std::string performance_unit = "N/A";
-
-  // Returns which search heuristic to use
-  size_t heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
 };
 
 // =================================================================================================
 
+struct TuningResult { std::string name; double score; Configuration config; };
+
+void PrintTimingsToFileAsJSON(const std::string &filename,
+                              const Device& device, const Platform& platform,
+                              const std::vector<std::pair<std::string,std::string>> &metadata,
+                              const std::vector<TuningResult>& tuning_results);
+
+void print_separator(const size_t parameters_size);
+
+// =================================================================================================
+
 // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
 // the results. Used for all types of kernel families. Note that this is a header-only function so
 // that it is automatically compiled for the various kernels (given as the 'C' template argument).
@@ -115,147 +137,266 @@ void Tuner(int argc, char* argv[]) {
     if (o == kArgK)        { args.k        = GetArgument(command_line_args, help, kArgK, defaults.default_k); }
     if (o == kArgAlpha)    { args.alpha    = GetArgument(command_line_args, help, kArgAlpha, GetScalar<T>()); }
     if (o == kArgBeta)     { args.beta     = GetArgument(command_line_args, help, kArgBeta, GetScalar<T>()); }
-    if (o == kArgFraction) { args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction); }
     if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, defaults.default_batch_count); }
-    if (o == kArgHeuristicSelection) {args.heuristic_selection = GetArgument(command_line_args, help, kArgHeuristicSelection, defaults.default_heuristic);  }
-    if (o == kArgPsoSwarmSize)   {args.pso_swarm_size      = GetArgument(command_line_args, help, kArgPsoSwarmSize , defaults.default_swarm_size_PSO);  }
-    if (o == kArgPsoInfGlobal)   {args.pso_inf_global      = GetArgument(command_line_args, help, kArgPsoInfGlobal, defaults.default_influence_global_PSO);  }
-    if (o == kArgPsoInfLocal)    {args.pso_inf_local       = GetArgument(command_line_args, help, kArgPsoInfLocal, defaults.default_influence_local_PSO);  }
-    if (o == kArgPsoInfRandom)   {args.pso_inf_random      = GetArgument(command_line_args, help, kArgPsoInfRandom, defaults.default_influence_random_PSO);  }
-    if (o == kArgAnnMaxTemp)     {args.ann_max_temperature = GetArgument(command_line_args, help, kArgAnnMaxTemp, defaults.default_max_temp_ann); }
   }
-  const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs);
-  fprintf(stdout, "%s\n", help.c_str());
+  args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction);
+  args.num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs);
+  const auto max_l2_norm = GetArgument(command_line_args, help, kArgMaxL2Norm, 1.0e-4);
+  printf("%s\n", help.c_str());
   const TunerSettings settings = C::GetTunerSettings(args);
 
   // Tests validity of the given arguments
   C::TestValidArguments(args);
 
+  // Initializes OpenCL
+  const auto platform = Platform(args.platform_id);
+  const auto device = Device(platform, args.device_id);
+  const auto context = Context(device);
+  auto queue = Queue(context, device);
+
   // Tests for validity of the precision and retrieves properties
-  auto isAMD = false;
-  auto isARM = false;
-  auto isGPU = false;
-  auto device_type = std::string{};
-  auto device_vendor = std::string{};
-  auto device_architecture = std::string{};
-  auto device_name = std::string{};
-  { // In a block such that the platform and the device are destroyed before initializing the tuner
-    const auto platform = Platform(args.platform_id);
-    const auto device = Device(platform, args.device_id);
-    if (!PrecisionSupported<T>(device)) {
-      printf("* Unsupported precision, skipping this tuning run\n\n");
-      return;
-    }
-    isAMD = device.IsAMD();
-    isARM = device.IsARM();
-    isGPU = device.IsGPU();
-    device_type = GetDeviceType(device);
-    device_vendor = GetDeviceVendor(device);
-    device_architecture = GetDeviceArchitecture(device);
-    device_name = GetDeviceName(device);
+  if (!PrecisionSupported<T>(device)) {
+    printf("* Unsupported precision, skipping this tuning run\n\n");
+    return;
   }
+  const auto device_type = GetDeviceType(device);
+  const auto device_vendor = GetDeviceVendor(device);
+  const auto device_architecture = GetDeviceArchitecture(device);
+  const auto device_name = GetDeviceName(device);
 
   // Creates input buffers with random data
-  auto x_vec = std::vector<T>(settings.size_x);
-  auto y_vec = std::vector<T>(settings.size_y);
-  auto a_mat = std::vector<T>(settings.size_a);
-  auto b_mat = std::vector<T>(settings.size_b);
-  auto c_mat = std::vector<T>(settings.size_c);
-  auto temp = std::vector<T>(settings.size_temp);
+  const auto buffer_sizes = std::vector<size_t>{
+      settings.size_x, settings.size_y,
+      settings.size_a, settings.size_b, settings.size_c,
+      settings.size_temp
+  };
   std::mt19937 mt(kSeed);
   std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
-  PopulateVector(x_vec, mt, dist);
-  PopulateVector(y_vec, mt, dist);
-  PopulateVector(a_mat, mt, dist);
-  PopulateVector(b_mat, mt, dist);
-  PopulateVector(c_mat, mt, dist);
-  PopulateVector(temp, mt, dist);
-
-  // Initializes the tuner for the chosen device
-  cltune::Tuner tuner(args.platform_id, args.device_id);
-
-  // Select the search method based on the command-line arguments
-  // If the tuner does not support the selected choice, full search will be returned.
-  auto method = settings.heuristic;
-  if      (method == 1) { tuner.UseRandomSearch(1.0/args.fraction); }
-  else if (method == 2) { tuner.UseAnnealing(1.0/args.fraction, args.ann_max_temperature); }
-  else if (method == 3) { tuner.UsePSO(1.0/args.fraction, args.pso_swarm_size, args.pso_inf_global,
-                                       args.pso_inf_local, args.pso_inf_random); }
-  else                  { tuner.UseFullSearch(); }
-
-  // Set extra settings for specific defines. This mimics src/routine.cc.
-  auto defines = std::string{""};
-  if (isAMD && isGPU) {
-    defines += "#define USE_CL_MAD 1\n";
-    defines += "#define USE_STAGGERED_INDICES 1\n";
+  auto source_buffers = std::vector<std::vector<T>>();
+  auto reference_buffers = std::vector<std::vector<T>>();
+  auto result_buffers = std::vector<std::vector<T>>();
+  auto device_buffers = std::vector<Buffer<T>>();
+  for (const auto size : buffer_sizes) {
+    auto host_buffer = std::vector<T>(size);
+    PopulateVector(host_buffer, mt, dist);
+    source_buffers.push_back(host_buffer);
+    auto reference_buffer = std::vector<T>(size);
+    reference_buffers.push_back(reference_buffer);
+    auto result_buffer = std::vector<T>(size);
+    result_buffers.push_back(result_buffer);
+    auto device_buffer = Buffer<T>(context, size);
+    device_buffers.push_back(device_buffer);
   }
-  if (isARM && isGPU) {
-    defines += "#define GLOBAL_MEM_FENCE 1\n";
-  }
-
-  // Loads the kernel sources and defines the kernel to tune
-  auto sources = defines + settings.sources;
-  auto id = tuner.AddKernelFromString(sources, settings.kernel_name, settings.global_size, settings.local_size);
-  tuner.SetReferenceFromString(sources, settings.kernel_name, settings.global_size_ref, settings.local_size_ref);
 
   // Sets the tunable parameters and their possible values
-  for (const auto &parameter: settings.parameters) {
-    tuner.AddParameter(id, parameter.first, parameter.second);
+  auto configurations = SetConfigurations(settings.parameters, C::SetConstraints());
+  printf("* Found %s%zu configuration(s)%s\n",
+         kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str());
+
+  // Full search, or a random 1/fraction subset (only a fraction above 1 can shrink the search)
+  if (args.fraction > 1.0) {
+    const auto new_size = static_cast<size_t>(configurations.size() / args.fraction);
+    auto rng = std::default_random_engine{};
+    std::shuffle(std::begin(configurations), std::end(configurations), rng);
+    configurations.resize(new_size);
+    printf("* Exploring a random subset of %s%zu configuration(s)%s\n",
+           kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str());
+  }
-  C::SetConstraints(tuner, id);
-  C::SetLocalMemorySize(tuner, id, args);
 
-  // Tests for a specific precision
-  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
-  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+  // Prints information about the parameters
+  printf("* Parameters explored: ");
+  for (const auto& parameter : settings.parameters) { printf("%s ", parameter.first.c_str()); }
+  printf("\n");
+
+  // Prints the header of the table ('i + 1 < size' avoids size_t wrap-around without parameters)
+  printf("\n");
+  printf("|   ID | total |");
+  for (auto i = size_t{0}; i + 1 < settings.parameters.size(); ++i) { printf("     "); }
+  printf("param |       compiles |         time | %6s |            status |\n", settings.performance_unit.c_str());
+  print_separator(settings.parameters.size());
+
+  // First runs a reference example to compare against (row padded the same way as the header)
+  try {
+    printf("|  ref |     - |");
+    for (auto i = size_t{0}; i + 1 < settings.parameters.size(); ++i) { printf("     "); }
+    printf("    - |");
 
-  // Modifies the thread-sizes (both global and local) based on the parameters
-  for (auto &parameters: settings.mul_local) { tuner.MulLocalSize(id, parameters); }
-  for (auto &parameters: settings.div_local) { tuner.DivLocalSize(id, parameters); }
-  for (auto &parameters: settings.mul_global) { tuner.MulGlobalSize(id, parameters); }
-  for (auto &parameters: settings.div_global) { tuner.DivGlobalSize(id, parameters); }
 
-  // Sets the function's arguments
-  C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
+    // Sets the input
+    for (const auto id : settings.inputs) {
+      device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
+    }
+
+    // Compiles the kernel
+    auto compiler_options = std::vector<std::string>();
+    const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name,
+                                           device, context, compiler_options);
+    auto kernel = Kernel(program, settings.kernel_name);
+    C::SetArguments(kernel, args, device_buffers);
+    printf("             %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str());
+
+    // Runs the kernel
+    const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device,
+                                    settings.global_size_ref, settings.local_size_ref);
+    printf("      - |");
+    if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); }
+
+    // Saves the result
+    for (const auto id : settings.outputs) {
+      device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]);
+    }
+    printf("      %sreference OK%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str());
+  }
+  catch (...) {
+    const auto status_code = DispatchExceptionCatchAll(true);
+    printf("* Exception caught with status %d while running the reference, aborting\n",
+           static_cast<int>(status_code));
+    return;
+  }
+  print_separator(settings.parameters.size());
 
   // Starts the tuning process
-  tuner.SetNumRuns(num_runs);
-  tuner.Tune();
+  auto results = std::vector<TuningResult>();
+  for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) {
+    try {
+
+      auto configuration = configurations[config_id];
+      printf("| %4zu | %5zu |", config_id + 1, configurations.size());
+      for (const auto& parameter : settings.parameters) {
+        printf("%5zu", configuration.at(parameter.first));
+      }
+      printf(" |");
+
+      // Sets the input
+      for (const auto id : settings.inputs) {
+        device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
+      }
+
+      // Sets the thread configuration
+      const auto global = SetThreadConfiguration(configuration, settings.global_size,
+                                                 settings.mul_global, settings.div_global);
+      const auto local = SetThreadConfiguration(configuration, settings.local_size,
+                                                settings.mul_local, settings.div_local);
+
+      // Sets the parameters for this configuration
+      auto kernel_source = std::string{""};
+      for (const auto &parameter : configuration) {
+        kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n";
+      }
+      kernel_source += settings.sources;
+
+      // Compiles the kernel
+      const auto start_time = std::chrono::steady_clock::now();
+      auto compiler_options = std::vector<std::string>();
+      const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name,
+                                             device, context, compiler_options, true);
+      auto kernel = Kernel(program, settings.kernel_name);
+      const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+      const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+      printf("   %sOK%s  %5.0lf ms |", kPrintSuccess.c_str(), kPrintEnd.c_str(), timing);
+
+      // Runs the kernel
+      C::SetArguments(kernel, args, device_buffers);
+      const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local);
+
+      // Kernel run was not successful
+      if (time_ms == -1.0) {
+        printf("      - |");
+        printf("   %sinvalid config.%s |", kPrintError.c_str(), kPrintEnd.c_str());
+        printf(" <-- skipping\n");
+        continue;
+      }
+
+      // Compares the results
+      auto l2_error = 0.0;
+      for (const auto id : settings.outputs) {
+        device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]);
+        for (auto index = size_t{0}; index<buffer_sizes[id]; ++index) {
+          const auto diff = SquaredDifference(result_buffers[id][index], reference_buffers[id][index]);
+          l2_error += diff;
+        }
+        l2_error /= static_cast<double>(buffer_sizes[id]);
+        if (std::isnan(l2_error) || l2_error > max_l2_norm) {
+          printf("      - |");
+          printf(" %sL2 error %8.2e%s |", kPrintError.c_str(), l2_error, kPrintEnd.c_str());
+          throw std::runtime_error("L2 error too large");
+        }
+      }
+
+      // All was OK
+      configuration["PRECISION"] = static_cast<size_t>(args.precision);
+      results.push_back(TuningResult{settings.kernel_name, time_ms, configuration});
+      printf(" %6.1lf |", settings.metric_amount / (time_ms * 1.0e6));
+      printf("     %sresults match%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str());
+    }
+    catch (const CLCudaAPIBuildError &e) {
+      const auto status_code = DispatchExceptionCatchAll(true);
+      printf("  %scompilation error: %5d%s     |",
+             kPrintError.c_str(), static_cast<int>(status_code), kPrintEnd.c_str());
+      printf("      - |                 - | <-- skipping\n");
+    }
+    catch (...) {
+      const auto status_code = DispatchExceptionCatchAll(true);
+      if (status_code != StatusCode::kUnknownError) {
+        printf("   %serror code %d%s |",
+               kPrintError.c_str(), static_cast<int>(status_code), kPrintEnd.c_str());
+      }
+      printf(" <-- skipping\n");
+    }
+  }
+
+  // Completed the tuning process
+  print_separator(settings.parameters.size());
+  printf("\n");
+  if (results.size() == 0) { return; }
 
-  // Prints the results to screen
-  auto time_ms = tuner.PrintToScreen();
-  tuner.PrintFormatted();
+  // Computes the best results
+  auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
+  const auto best_configuration = std::min_element(results.begin(), results.end(), comparison);
+  const auto best_time_ms = best_configuration->score;
+  if (best_time_ms == 0.0) { return; }
 
   // Also prints the performance of the best-case in terms of GB/s or GFLOPS
-  if (time_ms != 0.0) {
-    printf("[ -------> ] %.2lf ms", time_ms);
-    printf(" or %.1lf %s\n", settings.metric_amount/(time_ms*1.0e6), settings.performance_unit.c_str());
+  printf("\n");
+  printf("* Found best result %.2lf ms", best_time_ms);
+  printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6),
+         settings.performance_unit.c_str());
+  printf("* Best parameters: ");
+  auto best_string = std::string{""};
+  auto i = size_t{0};
+  for (const auto config : best_configuration->config) {
+    best_string += "" + config.first + "=" + ToString(config.second);
+    if (i < best_configuration->config.size() - 1) { best_string += " "; }
+    ++i;
   }
+  printf("%s\n\n", best_string.c_str());
 
   // Outputs the results as JSON to disk, including some meta-data
   auto precision_string = std::to_string(static_cast<size_t>(args.precision));
   auto metadata = std::vector<std::pair<std::string,std::string>>{
     {"kernel_family", settings.kernel_family},
     {"precision", precision_string},
-    {"clblast_device_type", device_type},
-    {"clblast_device_vendor", device_vendor},
-    {"clblast_device_architecture", device_architecture},
-    {"clblast_device_name", device_name}
+    {"best_kernel", best_configuration->name},
+    {"best_time", ToString(best_configuration->score)},
+    {"best_parameters", best_string}
   };
   for (auto &o: defaults.options) {
-    if (o == kArgM)     { metadata.push_back({"arg_m", std::to_string(args.m)}); }
-    if (o == kArgN)     { metadata.push_back({"arg_n", std::to_string(args.n)}); }
-    if (o == kArgK)     { metadata.push_back({"arg_k", std::to_string(args.k)}); }
+    if (o == kArgM)     { metadata.push_back({"arg_m", ToString(args.m)}); }
+    if (o == kArgN)     { metadata.push_back({"arg_n", ToString(args.n)}); }
+    if (o == kArgK)     { metadata.push_back({"arg_k", ToString(args.k)}); }
     if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
     if (o == kArgBeta)  { metadata.push_back({"arg_beta", ToString(args.beta)}); }
     if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); }
   }
-  tuner.PrintJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json", metadata);
- 
+  PrintTimingsToFileAsJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json",
+                           device, platform, metadata, results);
+
+  printf("* Completed tuning process\n");
+  printf("\n");
 }
 
 // =================================================================================================
 } // namespace clblast
 
-// CLBLAST_TUNING_H_
+// CLBLAST_TUNING_TUNING_H_
 #endif
diff --git a/src/utilities/clblast_exceptions.cpp b/src/utilities/clblast_exceptions.cpp
index 32526215..25e5f4be 100644
--- a/src/utilities/clblast_exceptions.cpp
+++ b/src/utilities/clblast_exceptions.cpp
@@ -45,7 +45,7 @@ RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreas
 
 // =================================================================================================
 
-StatusCode DispatchException()
+StatusCode DispatchException(const bool silent)
 {
   const char *message = nullptr;
   StatusCode status;
@@ -66,12 +66,41 @@ StatusCode DispatchException()
     status = StatusCode::kUnknownError;
   }
 
-  if (message) {
+  if (message && !silent) {
     fprintf(stderr, "CLBlast: %s\n", message);
   }
   return status;
 }
 
+StatusCode DispatchExceptionCatchAll(const bool silent)
+{
+  const char *message = nullptr;
+  StatusCode status;
+  // Must be called from within a catch-block: re-throws the active exception to inspect its type
+  try {
+    throw;
+  } catch (BLASError &e) {
+    // no message is printed for invalid argument errors
+    status = e.status();
+  } catch (CLCudaAPIError &e) {
+    message = e.what();
+    status = static_cast<StatusCode>(e.status()); // API error code re-interpreted as StatusCode
+  } catch (RuntimeErrorCode &e) {
+    message = e.what();
+    status = e.status();
+  } catch (Error<std::runtime_error> &e) {
+    message = e.what();
+    status = StatusCode::kUnknownError;
+  } catch (...) { // catch-all: every remaining exception type
+    message = "unknown exception type";
+    status = StatusCode::kUnknownError;
+  }
+
+  if (message && !silent) { // 'silent' suppresses the message, not the returned status
+    fprintf(stderr, "CLBlast: %s\n", message);
+  }
+  return status;
+}
 // =================================================================================================
 
 StatusCode DispatchExceptionForC()
diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp
index a790be9c..9bd38187 100644
--- a/src/utilities/clblast_exceptions.hpp
+++ b/src/utilities/clblast_exceptions.hpp
@@ -37,7 +37,8 @@ class RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> {
 // =================================================================================================
 
 // Handles (most of the) runtime exceptions and converts them to StatusCode
-StatusCode DispatchException();
+StatusCode DispatchException(const bool silent = false);
+StatusCode DispatchExceptionCatchAll(const bool silent = false);
 
 // Handles remaining exceptions and converts them to StatusCode::kUnhandledError
 StatusCode DispatchExceptionForC();
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
new file mode 100644
index 00000000..2a55506e
--- /dev/null
+++ b/src/utilities/compile.cpp
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the kernel compilation functions (see the header for more information).
+//
+// =================================================================================================
+
+#include <vector>
+#include <chrono>
+
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source code: prepends a header with device/precision-specific defines
+// and the common kernel header to 'source_string', then builds it for 'device'. Build errors are
+// re-thrown; for compilation warnings/errors the build log is printed first unless 'silent' is set.
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options, const bool silent) {
+  auto header_string = "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
+
+  // Adds the name of the routine as a define
+  header_string += "#define ROUTINE_" + routine_name + "\n";
+
+  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+  // which it is known to work with all OpenCL platforms.
+  if (device.IsNVIDIA() || device.IsARM()) {
+    header_string += "#define USE_INLINE_KEYWORD 1\n";
+  }
+
+  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
+  // performance, but might result in a reduced accuracy.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_CL_MAD 1\n";
+  }
+
+  // For specific devices, use staggered/shuffled workgroup indices.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device.IsARM() && device.IsGPU()) {
+    header_string += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
+  #ifdef CUDA_API
+    header_string +=
+      #include "kernels/opencl_to_cuda.h"
+    ;
+  #endif
+
+  // Loads the common header (typedefs and defines and such)
+  header_string +=
+    #include "kernels/common.opencl"
+  ;
+
+  // Prints details of the routine to compile in case of debugging in verbose mode
+  #ifdef VERBOSE
+    printf("[DEBUG] Compiling routine '%s-%s'\n",
+           routine_name.c_str(), ToString(precision).c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
+  // Compiles the kernel
+  auto program = Program(context, header_string + source_string);
+  try {
+    program.Build(device, options);
+  } catch (const CLCudaAPIBuildError &e) {
+    if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) {
+      fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
+              program.GetBuildInfo(device).c_str());
+    }
+    throw;
+  }
+
+  // Prints the elapsed compilation time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+  #endif
+
+  return program;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp
new file mode 100644
index 00000000..0315d70c
--- /dev/null
+++ b/src/utilities/compile.hpp
@@ -0,0 +1,36 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the CLBlast way to compile a kernel from source, used for the library and for
+// the auto-tuners.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_UTILITIES_COMPILE_H_
+#define CLBLAST_UTILITIES_COMPILE_H_
+
+#include <string>
+#include <vector>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source code; 'silent' suppresses printing of the compiler's build log
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options, const bool silent = false);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_UTILITIES_COMPILE_H_
+#endif
diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp
new file mode 100644
index 00000000..af6a8ff2
--- /dev/null
+++ b/src/utilities/timing.cpp
@@ -0,0 +1,79 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides helper functions for time measurement and such.
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <exception>
+
+#include "utilities/timing.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Validates the local thread sizes and local memory usage against the device limits (throwing a
+// RuntimeErrorCode on failure) and returns TimeFunction()'s result for launching the kernel.
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local) {
+  auto event = Event();
+  if (!local.empty()) {
+    // Tests for validity of the local thread sizes
+    if (local.size() > device.MaxWorkItemDimensions()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
+    }
+    const auto max_work_item_sizes = device.MaxWorkItemSizes();
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (local[i] > max_work_item_sizes[i]) {
+        throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
+      }
+    }
+    auto local_size = size_t{1};
+    for (auto &item: local) { local_size *= item; }
+    if (local_size > device.MaxWorkGroupSize()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+    }
+
+    // Make sure the global sizes are at least the local sizes (only for dimensions present in both)
+    for (auto i=size_t{0}; i<global.size() && i<local.size(); ++i) {
+      if (global[i] < local[i]) { global[i] = local[i]; }
+    }
+  }
+
+  // Tests for local memory usage
+  if (!device.IsLocalMemoryValid(kernel.LocalMemUsage(device))) {
+    throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
+  }
+
+  // Times the kernel
+  const auto run_kernel_func = [&]() {
+      kernel.Launch(queue, global, local, event.pointer());
+      event.WaitForCompletion();
+      queue.Finish();
+  };
+  return TimeFunction(num_runs, run_kernel_func);
+}
+
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local) {
+  try { // times the kernel and prints the result; any exception becomes an 'error' cell instead
+    const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local);
+    printf(" %9.2lf ms |", time_ms); // printed as a table cell, without a newline
+    return time_ms;
+  }
+  catch (...) {
+    const auto status_code = DispatchExceptionCatchAll(true); // silent: no message on stderr
+    printf("  error %-5d |", static_cast<int>(status_code));
+    return -1.0; // invalid: a negative time is returned when the run threw an exception
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index bfad6147..a66aba4b 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -40,6 +40,14 @@ double TimeFunction(const size_t num_runs, F const &function) {
 
 // =================================================================================================
 
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local);
+
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local);
+
+// =================================================================================================
+
 using Timing = std::pair<size_t, double>;
 
 template <typename T, typename F>
@@ -47,76 +55,27 @@ std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t
                                 const size_t num_runs, const Queue& queue,
                                 const std::vector<Buffer<T>>& buffers, F const &routine) {
   auto timings = std::vector<Timing>();
+  printf("|  value |         time |\n");
+  printf("x--------x--------------x\n");
   for (auto value = from; value < to; value += step) {
-    printf("[ RUN      ] Running with value %zu\n", value);
+    printf("| %6zu |", value);
     try {
       const auto FunctionToTune = [&]() { routine(value, queue, buffers); };
       const auto time_ms = TimeFunction(num_runs, FunctionToTune);
-      printf("[       OK ] Took %.2lf ms\n", time_ms);
+      printf(" %9.2lf ms |\n", time_ms);
       timings.push_back({value, time_ms});
     }
     catch (...) {
-      printf("[    ERROR ] Exception caught\n");
+      const auto status_code = DispatchExceptionCatchAll(true);
+      printf("  error %-5d |\n", static_cast<int>(status_code));
       timings.push_back({value, -1.0}); // invalid
     }
   }
+  printf("x--------x--------------x\n");
   return timings;
 }
 
 // =================================================================================================
-
-using TuningParameter = std::pair<std::string, size_t>;
-using TuningParameters = std::vector<TuningParameter>;
-struct TuningResult { std::string name; double score; TuningParameters parameters; };
-
-void PrintTimingsToFileAsJSON(const std::string &filename,
-                              const Device& device, const Platform& platform,
-                              const std::vector<std::pair<std::string,std::string>> &metadata,
-                              const std::vector<TuningResult>& tuning_results) {
-  printf("[  STATUS  ] Writing results to '%s'\n", filename.c_str());
-  auto file = fopen(filename.c_str(), "w");
-  fprintf(file, "{\n");
-  for (auto &datum: metadata) {
-    fprintf(file, "  \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
-  }
-  fprintf(file, "  \"platform_version\": \"%s\",\n", platform.Version().c_str());
-  fprintf(file, "  \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
-  fprintf(file, "  \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
-  fprintf(file, "  \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
-  fprintf(file, "  \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
-  fprintf(file, "  \"device_core_clock\": \"%zu\",\n", device.CoreClock());
-  fprintf(file, "  \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
-  fprintf(file, "  \"results\": [\n");
-
-  // Loops over all results
-  auto num_results = tuning_results.size();
-  for (auto r = size_t{0}; r < num_results; ++r) {
-    auto result = tuning_results[r];
-    fprintf(file, "    {\n");
-    fprintf(file, "      \"kernel\": \"%s\",\n", result.name.c_str());
-    fprintf(file, "      \"time\": %.3lf,\n", result.score);
-
-    // Loops over all the parameters for this result
-    fprintf(file, "      \"parameters\": {");
-    auto num_configs = result.parameters.size();
-    for (auto p=size_t{0}; p<num_configs; ++p) {
-      auto config = result.parameters[p];
-      fprintf(file, "\"%s\": %zu", config.first.c_str(), config.second);
-      if (p < num_configs-1) { fprintf(file, ","); }
-    }
-    fprintf(file, "}\n");
-
-    // The footer
-    fprintf(file, "    }");
-    if (r < num_results - 1) { fprintf(file, ","); }
-    fprintf(file, "\n");
-  }
-  fprintf(file, "  ]\n");
-  fprintf(file, "}\n");
-  fclose(file);
-}
-
-// =================================================================================================
 } // namespace clblast
 
 // CLBLAST_TIMING_H_
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index f2574104..1546fbf5 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -397,6 +397,37 @@ template <> bool PrecisionSupported<half>(const Device &device) { return device.
 
 // =================================================================================================
 
+// Retrieves the squared difference (val1 - val2)^2 as a double, e.g. to compute the L2 error
+template <typename T>
+double SquaredDifference(const T val1, const T val2) {
+  const auto difference = (val1 - val2);
+  return static_cast<double>(difference * difference);
+}
+
+// Explicitly instantiates the default case for the standard data-types float and double
+template double SquaredDifference<float>(const float, const float);
+template double SquaredDifference<double>(const double, const double);
+
+// Specialisations for non-standard data-types: complex values (float2/double2) and half
+template <>
+double SquaredDifference(const float2 val1, const float2 val2) {
+  const auto real = SquaredDifference(val1.real(), val2.real());
+  const auto imag = SquaredDifference(val1.imag(), val2.imag());
+  return real + imag; // sum of the squared differences of the real and imaginary parts
+}
+template <>
+double SquaredDifference(const double2 val1, const double2 val2) {
+  const auto real = SquaredDifference(val1.real(), val2.real());
+  const auto imag = SquaredDifference(val1.imag(), val2.imag());
+  return real + imag; // sum of the squared differences of the real and imaginary parts
+}
+template <>
+double SquaredDifference(const half val1, const half val2) {
+  return SquaredDifference(HalfToFloat(val1), HalfToFloat(val2)); // computed on float values
+}
+
+// =================================================================================================
+
 // High-level info
 std::string GetDeviceType(const Device& device) {
   return device.Type();
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index f56226be..e26721b3 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -98,6 +98,7 @@ constexpr auto kArgDilationW = "dilationw";
 // The tuner-specific arguments in string form
 constexpr auto kArgFraction = "fraction";
 constexpr auto kArgHeuristicSelection = "heuristic";
+constexpr auto kArgMaxL2Norm = "max_l2_norm";
 // PSO tuner-specific arguments in string form
 constexpr auto kArgPsoSwarmSize = "pso_swarm_size";
 constexpr auto kArgPsoInfGlobal = "pso_inf_global";
@@ -323,6 +324,12 @@ bool PrecisionSupported(const Device &device);
 
 // =================================================================================================
 
+// Retrieves the squared difference, used for example for computing the L2 error
+template <typename T>
+double SquaredDifference(const T val1, const T val2);
+
+// =================================================================================================
+
 // Device information in a specific CLBlast form
 std::string GetDeviceType(const Device& device);
 std::string GetDeviceVendor(const Device& device);
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index a10736ea..d6a346a6 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -539,37 +539,6 @@ bool TestSimilarity(const half val1, const half val2) {
 
 // =================================================================================================
 
-// Retrieves the squared difference, used for example for computing the L2 error
-template <typename T>
-double SquaredDifference(const T val1, const T val2) {
-  const auto difference = (val1 - val2);
-  return static_cast<double>(difference * difference);
-}
-
-// Compiles the default case for standard data-types
-template double SquaredDifference<float>(const float, const float);
-template double SquaredDifference<double>(const double, const double);
-
-// Specialisations for non-standard data-types
-template <>
-double SquaredDifference(const float2 val1, const float2 val2) {
-  const auto real = SquaredDifference(val1.real(), val2.real());
-  const auto imag = SquaredDifference(val1.imag(), val2.imag());
-  return real + imag;
-}
-template <>
-double SquaredDifference(const double2 val1, const double2 val2) {
-  const auto real = SquaredDifference(val1.real(), val2.real());
-  const auto imag = SquaredDifference(val1.imag(), val2.imag());
-  return real + imag;
-}
-template <>
-double SquaredDifference(const half val1, const half val2) {
-  return SquaredDifference(HalfToFloat(val1), HalfToFloat(val2));
-}
-
-// =================================================================================================
-
 // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
 // routines. This function is specialised for the different data-types.
 template <> const std::vector<float> GetExampleScalars(const bool full_test) {
diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp
index 640f870a..7e17e53d 100644
--- a/test/correctness/tester.hpp
+++ b/test/correctness/tester.hpp
@@ -201,10 +201,6 @@ template <typename T> double getL2ErrorMargin();
 template <typename T>
 bool TestSimilarity(const T val1, const T val2);
 
-// Retrieves the squared difference, used for example for computing the L2 error
-template <typename T>
-double SquaredDifference(const T val1, const T val2);
-
 // Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
 // routines. This function is specialised for the different data-types.
 template <typename T>
author	Cedric Nugteren <web@cedricnugteren.nl>	2017-11-19 20:05:15 +0100
committer	GitHub <noreply@github.com>	2017-11-19 20:05:15 +0100
commit	da76d7ab81555452a1049eb1a6d130073427067d (patch)
tree	92439d8bee44c34d63f288a73bdc372ba84dc42b
parent	c41d219ea42087c1b8d933b733b381005123cb91 (diff)
parent	defad3d1a249dd5f8c011cf28cc3c888d710d56a (diff)