summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.appveyor.yml4
-rw-r--r--.travis.yml2
-rw-r--r--CHANGELOG5
-rw-r--r--CMakeLists.txt49
-rw-r--r--README.md4
-rw-r--r--ROADMAP.md6
-rw-r--r--cmake/Modules/FindCBLAS.cmake2
-rw-r--r--cmake/Modules/FindMKL.cmake72
-rw-r--r--doc/tuning.md68
-rw-r--r--src/cupp11.hpp4
-rw-r--r--src/database/apple_cpu_fallback.hpp7
-rw-r--r--src/database/database.cpp6
-rw-r--r--src/kernels/level2/xtrsv.opencl2
-rw-r--r--src/kernels/level3/invert_diagonal_blocks_part2.opencl24
-rw-r--r--src/kernels/level3/level3.opencl2
-rw-r--r--src/routines/common.cpp70
-rw-r--r--src/routines/common.hpp10
-rw-r--r--src/routines/level2/xtrsv.cpp11
-rw-r--r--src/routines/level3/xgemm.hpp6
-rw-r--r--src/routines/level3/xtrsm.cpp9
-rw-r--r--src/routines/levelx/xinvert.cpp23
-rw-r--r--src/tuning/routines/xgemm.cpp100
-rw-r--r--src/utilities/utilities.hpp2
-rw-r--r--test/performance/client.cpp104
-rw-r--r--test/performance/client.hpp7
-rw-r--r--test/wrapper_cblas.hpp6
26 files changed, 454 insertions, 151 deletions
diff --git a/.appveyor.yml b/.appveyor.yml
index 3feb175c..db99d08a 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -57,8 +57,8 @@ build_script:
after_build:
- ps: pushd $env:CLBLAST_BUILD
- - 7z a CLBlast-1.3.0-Windows-x64.zip .\install_dir\*
- - ps: mv CLBlast-1.3.0-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
+ - 7z a CLBlast-1.4.0-Windows-x64.zip .\install_dir\*
+ - ps: mv CLBlast-1.4.0-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
artifacts:
- path: '*.zip'
diff --git a/.travis.yml b/.travis.yml
index 125b6a10..17d9048c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,7 +21,7 @@ matrix:
env:
global:
- - CLBLAST_VERSION=1.3.0
+ - CLBLAST_VERSION=1.4.0
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
- CLBLAST_INSTALL=${TRAVIS_BUILD_DIR}/bin/CLBlast-${CLBLAST_VERSION}
- CLBLAST_TAR=CLBlast-${CLBLAST_VERSION}-${TRAVIS_OS_NAME}-x64.tar.gz
diff --git a/CHANGELOG b/CHANGELOG
index c86ab70b..23052c83 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,5 @@
-Development (next version)
+Version 1.4.0
- Added Python interface to CLBlast 'PyCLBlast'
- Added CLBlast to Ubuntu PPA and macOS Homebrew package managers
- Added an API to run the tuners programmatically without any I/O
@@ -8,7 +8,8 @@ Development (next version)
- Re-added a local memory size constraint to the tuners
- The routine tuners now automatically pick up tuning results from disk from the kernel tuners
- Updated and reorganised the CLBlast documentation
-- Added a 'canary' region to check for overflows in the tuner and tests (insipred by clARMOR)
+- Added a 'canary' region to check for overflows in the tuner and tests (inspired by clARMOR)
+- Added an option to test against and compare performance with Intel's MKL
- Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program
- Fixed incorrect releasing of the OpenCL program resulting in segfaults / access violations
- Various minor fixes and enhancements
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 72aaa533..b1a6de5b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,7 +21,7 @@ endif()
# CMake project details
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 1)
-set(clblast_VERSION_MINOR 3)
+set(clblast_VERSION_MINOR 4)
set(clblast_VERSION_PATCH 0)
set(clblast_VERSION "${clblast_VERSION_MAJOR}.${clblast_VERSION_MINOR}.${clblast_VERSION_PATCH}")
set(clblast_SOVERSION ${clblast_VERSION_MAJOR})
@@ -170,16 +170,17 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL Android)
else()
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake",
- # "FindCBLAS.cmake" and "FindcuBLAS.cmake" are included.
+ # "FindCBLAS.cmake", "FindMKL.cmake", and "FindcuBLAS.cmake" are included.
if(CLIENTS OR TESTS)
find_package(CBLAS)
+ find_package(MKL)
if(OPENCL)
find_package(clBLAS)
endif()
if(CUBLAS)
find_package(cuBLAS)
endif()
- if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
+ if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND AND NOT MKL_FOUND)
if(TESTS)
message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
set(TESTS OFF)
@@ -423,12 +424,14 @@ if(TUNERS)
target_include_directories(clblast_tuner_${KERNEL} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
endforeach()
- foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
- add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp test/test_utilities.cpp)
- target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
- target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS} ${clblast_SOURCE_DIR})
- install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
- endforeach()
+ if(OPENCL)
+ foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
+ add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp test/test_utilities.cpp)
+ target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
+ target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS} ${clblast_SOURCE_DIR})
+ install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
+ endforeach()
+ endif()
# Adds 'alltuners' target: runs all tuners for all precisions
set(ALLTUNERS )
@@ -439,12 +442,14 @@ if(TUNERS)
endforeach()
set(ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
endforeach()
- foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
- foreach(PRECISION ${PRECISIONS})
- set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
+ if(OPENCL)
+ foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
+ foreach(PRECISION ${PRECISIONS})
+ set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
+ endforeach()
+ set(ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})
endforeach()
- set(ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})
- endforeach()
+ endif()
add_custom_target(alltuners ${ALLTUNERS} DEPENDS ${ALLTUNERSDEPENDS})
endif()
@@ -468,9 +473,19 @@ if(CLIENTS OR TESTS)
add_definitions(" -DCLBLAST_REF_CLBLAS")
endif()
endif()
- if(CBLAS_FOUND)
- set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
- set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
+ if(CBLAS_FOUND OR MKL_FOUND)
+ if(MKL_FOUND) # prefers MKL over another CBLAS implementation
+ set(REF_INCLUDES ${REF_INCLUDES} ${MKL_INCLUDE_DIRS})
+ set(REF_LIBRARIES ${REF_LIBRARIES} ${MKL_LIBRARIES})
+ if(MSVC)
+ add_definitions(" /DCLBLAST_REF_CBLAS_MKL")
+ else()
+ add_definitions(" -DCLBLAST_REF_CBLAS_MKL")
+ endif()
+ else()
+ set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
+ set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
+ endif()
set(WRAPPERS ${WRAPPERS} test/wrapper_cblas.hpp)
if(MSVC)
add_definitions(" /DCLBLAST_REF_CBLAS")
diff --git a/README.md b/README.md
index 0070a84c..28fd42d2 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,9 @@
CLBlast: The tuned OpenCL BLAS library
================
-| | Build status | Tests on Intel CPU | Tests on NVIDIA GPU | Tests on Intel GPU |
+| | Build status | Tests on Intel CPU | Tests on NVIDIA GPU | Other tests |
|-----|-----|-----|-----|-----|
-| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-i7-4790k) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-nvidia-k5000.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-nvidia-k5000) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-HD4600.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-HD4600) |
+| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-i7-4790k) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-nvidia-k5000.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-nvidia-k5000) | N/A |
| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-intel-e5-2620-v4.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-intel-e5-2620-v4) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-k80.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-k80) | N/A |
| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-osx-intel-i5-4278U.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-osx-intel-i5-4278U) | N/A | N/A |
diff --git a/ROADMAP.md b/ROADMAP.md
index 9b870523..c6f9f4f1 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -18,8 +18,8 @@ This file gives an overview of the main features planned for addition to CLBlast
| [#223](https://github.com/CNugteren/CLBlast/issues/223) | Feb '18 | CNugteren | ✔ | Python OpenCL interface |
| [#237](https://github.com/CNugteren/CLBlast/issues/237) | Mar '18 | CNugteren | ✔ | Making tuning possible from the CLBlast API |
| [#228](https://github.com/CNugteren/CLBlast/issues/228) | Mar-Apr '18 | CNugteren | ✔ | Improving performance for Qualcomm Adreno GPUs |
-| [#267](https://github.com/CNugteren/CLBlast/issues/267) | May '18 | CNugteren | | Merge im2col and GEMM into a direct kernel |
-| [#270](https://github.com/CNugteren/CLBlast/issues/270) | July '18 | CNugteren | | Implement col2im |
-| - | July '18 | CNugteren | | Add a SYCL interface to the library |
+| [#267](https://github.com/CNugteren/CLBlast/issues/267) | July '18 | CNugteren | | Merge im2col and GEMM into a direct kernel |
+| [#270](https://github.com/CNugteren/CLBlast/issues/270) | Aug '18 | CNugteren | | Implement col2im |
+| - | Aug '18 | CNugteren | | Add a SYCL interface to the library |
| [#136](https://github.com/CNugteren/CLBlast/issues/136) | ?? | CNugteren | | Implement xAXPBY and xSET |
| [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? | dividiti | | Problem-specific tuning parameter selection |
diff --git a/cmake/Modules/FindCBLAS.cmake b/cmake/Modules/FindCBLAS.cmake
index 1439bfcb..fa97ce9f 100644
--- a/cmake/Modules/FindCBLAS.cmake
+++ b/cmake/Modules/FindCBLAS.cmake
@@ -48,7 +48,7 @@ mark_as_advanced(CBLAS_INCLUDE_DIRS)
# Finds the library
find_library(CBLAS_LIBRARIES
- NAMES cblas blas mkl blis openblas accelerate
+ NAMES cblas blas blis openblas accelerate
HINTS ${CBLAS_HINTS}
PATH_SUFFIXES
lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake
new file mode 100644
index 00000000..df481a4d
--- /dev/null
+++ b/cmake/Modules/FindMKL.cmake
@@ -0,0 +1,72 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+# width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+#
+# ==================================================================================================
+#
+# Defines the following variables:
+# MKL_FOUND Boolean holding whether or not the Intel MKL BLAS library was found
+# MKL_INCLUDE_DIRS The Intel MKL BLAS include directory
+# MKL_LIBRARIES The Intel MKL BLAS library
+#
+# In case MKL is not installed in the default directory, set the MKL_ROOT variable to point to
+# the root of MKL, such that 'mkl_cblas.h' can be found in $MKL_ROOT/include. This can either be
+# done using an environmental variable (e.g. export MKL_ROOT=/path/to/MKL) or using a CMake
+# variable (e.g. cmake -DMKL_ROOT=/path/to/MKL ..).
+#
+# ==================================================================================================
+
+# Sets the possible install locations
+set(MKL_HINTS
+ ${MKL_ROOT}
+ $ENV{MKL_ROOT}
+)
+set(MKL_PATHS
+ /usr
+ /usr/local
+ /usr/local/opt
+ /usr/local/mkl
+ /opt/intel
+ /opt/intel/mkl
+)
+
+# Finds the include directories
+find_path(MKL_INCLUDE_DIRS
+ NAMES mkl_cblas.h
+ HINTS ${MKL_HINTS}
+ PATH_SUFFIXES
+ include inc include/x86_64 include/x64
+ PATHS ${MKL_PATHS}
+ DOC "Intel MKL CBLAS include header mkl_cblas.h"
+)
+mark_as_advanced(MKL_INCLUDE_DIRS)
+
+# Finds the libraries
+set(MKL_LIB_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import lib/intel64)
+find_library(MKL_LIBRARIES_LP64 NAMES mkl_intel_lp64 HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL lp64 library")
+find_library(MKL_LIBRARIES_THREAD NAMES mkl_intel_thread HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL thread library")
+find_library(MKL_LIBRARIES_CORE NAMES mkl_core HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL core library")
+find_library(MKL_LIBRARIES_OMP NAMES iomp5 HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel OpenMP library")
+set(MKL_LIBRARIES ${MKL_LIBRARIES_LP64} ${MKL_LIBRARIES_THREAD} ${MKL_LIBRARIES_CORE} ${MKL_LIBRARIES_OMP})
+mark_as_advanced(MKL_LIBRARIES)
+
+# ==================================================================================================
+
+# Notification messages
+if(NOT MKL_INCLUDE_DIRS)
+ message(STATUS "Could NOT find 'mkl_cblas.h', install MKL or set MKL_ROOT")
+endif()
+if(NOT MKL_LIBRARIES)
+ message(STATUS "Could NOT find the Intel MKL BLAS library, install it or set MKL_ROOT")
+endif()
+
+# Determines whether or not MKL was found
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MKL DEFAULT_MSG MKL_INCLUDE_DIRS MKL_LIBRARIES)
+
+# ==================================================================================================
diff --git a/doc/tuning.md b/doc/tuning.md
index b5186ac6..5cf32ca8 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -84,6 +84,74 @@ The kernels `gemm` and `gemm_direct` have too many parameters to explore. Theref
There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. However, they do automatically pick up kernel tuning results from the current folder if there are any. An example is the GEMM routine tuner, which determines when to use the direct or the in-direct GEMM kernel.
+Here are all the tuners included in the `make alltuners` target (in the same order) with all their precision arguments:
+
+ ./clblast_tuner_copy_fast -precision 32
+ ./clblast_tuner_copy_fast -precision 64
+ ./clblast_tuner_copy_fast -precision 3232
+ ./clblast_tuner_copy_fast -precision 6464
+ ./clblast_tuner_copy_fast -precision 16
+ ./clblast_tuner_copy_pad -precision 32
+ ./clblast_tuner_copy_pad -precision 64
+ ./clblast_tuner_copy_pad -precision 3232
+ ./clblast_tuner_copy_pad -precision 6464
+ ./clblast_tuner_copy_pad -precision 16
+ ./clblast_tuner_transpose_fast -precision 32
+ ./clblast_tuner_transpose_fast -precision 64
+ ./clblast_tuner_transpose_fast -precision 3232
+ ./clblast_tuner_transpose_fast -precision 6464
+ ./clblast_tuner_transpose_fast -precision 16
+ ./clblast_tuner_transpose_pad -precision 32
+ ./clblast_tuner_transpose_pad -precision 64
+ ./clblast_tuner_transpose_pad -precision 3232
+ ./clblast_tuner_transpose_pad -precision 6464
+ ./clblast_tuner_transpose_pad -precision 16
+ ./clblast_tuner_xaxpy -precision 32
+ ./clblast_tuner_xaxpy -precision 64
+ ./clblast_tuner_xaxpy -precision 3232
+ ./clblast_tuner_xaxpy -precision 6464
+ ./clblast_tuner_xaxpy -precision 16
+ ./clblast_tuner_xdot -precision 32
+ ./clblast_tuner_xdot -precision 64
+ ./clblast_tuner_xdot -precision 3232
+ ./clblast_tuner_xdot -precision 6464
+ ./clblast_tuner_xdot -precision 16
+ ./clblast_tuner_xger -precision 32
+ ./clblast_tuner_xger -precision 64
+ ./clblast_tuner_xger -precision 3232
+ ./clblast_tuner_xger -precision 6464
+ ./clblast_tuner_xger -precision 16
+ ./clblast_tuner_xgemm -precision 32
+ ./clblast_tuner_xgemm -precision 64
+ ./clblast_tuner_xgemm -precision 3232
+ ./clblast_tuner_xgemm -precision 6464
+ ./clblast_tuner_xgemm -precision 16
+ ./clblast_tuner_xgemm_direct -precision 32
+ ./clblast_tuner_xgemm_direct -precision 64
+ ./clblast_tuner_xgemm_direct -precision 3232
+ ./clblast_tuner_xgemm_direct -precision 6464
+ ./clblast_tuner_xgemm_direct -precision 16
+ ./clblast_tuner_xgemv -precision 32
+ ./clblast_tuner_xgemv -precision 64
+ ./clblast_tuner_xgemv -precision 3232
+ ./clblast_tuner_xgemv -precision 6464
+ ./clblast_tuner_xgemv -precision 16
+ ./clblast_tuner_invert -precision 32
+ ./clblast_tuner_invert -precision 64
+ ./clblast_tuner_invert -precision 3232
+ ./clblast_tuner_invert -precision 6464
+ ./clblast_tuner_invert -precision 16
+ ./clblast_tuner_routine_xgemm -precision 32
+ ./clblast_tuner_routine_xgemm -precision 64
+ ./clblast_tuner_routine_xgemm -precision 3232
+ ./clblast_tuner_routine_xgemm -precision 6464
+ ./clblast_tuner_routine_xgemm -precision 16
+ ./clblast_tuner_routine_xtrsv -precision 32
+ ./clblast_tuner_routine_xtrsv -precision 64
+ ./clblast_tuner_routine_xtrsv -precision 3232
+ ./clblast_tuner_routine_xtrsv -precision 6464
+ ./clblast_tuner_routine_xtrsv -precision 16
+
Using the tuning results
-------------
diff --git a/src/cupp11.hpp b/src/cupp11.hpp
index 509ae3e8..a1cb1614 100644
--- a/src/cupp11.hpp
+++ b/src/cupp11.hpp
@@ -678,8 +678,8 @@ public:
}
// Regular constructor with memory management
- explicit Kernel(const Program &program, const std::string &name): name_(name) {
- CheckError(cuModuleGetFunction(&kernel_, program.GetModule(), name.c_str()));
+ explicit Kernel(const std::shared_ptr<Program> program, const std::string &name): name_(name) {
+ CheckError(cuModuleGetFunction(&kernel_, program->GetModule(), name.c_str()));
}
// Sets a kernel argument at the indicated position. This stores both the value of the argument
diff --git a/src/database/apple_cpu_fallback.hpp b/src/database/apple_cpu_fallback.hpp
index fdd9327d..55bcc220 100644
--- a/src/database/apple_cpu_fallback.hpp
+++ b/src/database/apple_cpu_fallback.hpp
@@ -41,7 +41,7 @@ const DatabaseEntry XgerApple = {
"Xger", Precision::kAny, {"WGS1", "WGS2", "WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
};
const DatabaseEntry XtrsvApple = {
- "Xtrsv", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
+ "Xtrsv", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
};
const DatabaseEntry XgemmApple = {
"Xgemm", Precision::kAny, {"GEMMK", "KREG", "KWG", "KWI", "MDIMA", "MDIMC", "MWG", "NDIMB", "NDIMC", "NWG", "SA", "SB", "STRM", "STRN", "VWM", "VWN"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1 } } } } } } }
@@ -62,7 +62,10 @@ const DatabaseEntry PadtransposeApple = {
"Padtranspose", Precision::kAny, {"PADTRA_PAD", "PADTRA_TILE", "PADTRA_WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
};
const DatabaseEntry InvertApple = {
- "Invert", Precision::kAny, {"INTERNAL_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
+ "Invert", Precision::kAny, {"INTERNAL_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
+};
+const DatabaseEntry TrsvRoutineApple = {
+ "TrsvRoutine", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
};
// =================================================================================================
diff --git a/src/database/database.cpp b/src/database/database.cpp
index b2f70e49..fca3102d 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -45,7 +45,8 @@ const std::vector<database::DatabaseEntry> Database::apple_cpu_fallback = std::v
database::XgemvApple, database::XgemvFastApple, database::XgemvFastRotApple, database::XgerApple, database::XtrsvApple,
database::XgemmApple, database::XgemmDirectApple,
database::CopyApple, database::PadApple, database::TransposeApple, database::PadtransposeApple,
- database::InvertApple
+ database::InvertApple,
+ database::TrsvRoutineApple
};
// The default values
@@ -98,7 +99,8 @@ Database::Database(const Device &device, const std::string &kernel_name,
if (device.Type() == "CPU") {
const auto extensions = device.Capabilities();
const auto is_apple = (extensions.find("cl_APPLE_SetMemObjectDestructor") == std::string::npos) ? false : true;
- if (is_apple) {
+ const auto is_likely_apple = device.MaxWorkGroupSize() <= 32;
+ if (is_apple || is_likely_apple) {
databases.push_front(apple_cpu_fallback);
}
}
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
index 8777eb77..e7b6ae79 100644
--- a/src/kernels/level2/xtrsv.opencl
+++ b/src/kernels/level2/xtrsv.opencl
@@ -18,7 +18,7 @@ R"(
// =================================================================================================
#if defined(ROUTINE_TRSV)
-__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
+__kernel
void FillVector(const int n, const int inc, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
index 8736203c..8e9b583e 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
@@ -19,7 +19,7 @@ R"(
#if defined(ROUTINE_INVERT)
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -28,7 +28,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -36,7 +36,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -45,7 +45,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -53,7 +53,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -62,7 +62,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -72,7 +72,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
// =================================================================================================
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -81,7 +81,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -89,7 +89,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -98,7 +98,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -106,7 +106,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -115,7 +115,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl
index c67851df..bea73daf 100644
--- a/src/kernels/level3/level3.opencl
+++ b/src/kernels/level3/level3.opencl
@@ -76,7 +76,7 @@ R"(
// =================================================================================================
#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
-__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
+__kernel
void FillMatrix(const int m, const int n, const int ld, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index 5b80e3f2..695785c4 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -13,6 +13,7 @@
#include <vector>
#include <chrono>
+#include <iostream>
#include "routines/common.hpp"
@@ -38,13 +39,22 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
auto local_size = size_t{1};
for (auto &item: local) { local_size *= item; }
if (local_size > device.MaxWorkGroupSize()) {
- throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+ throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal,
+ ToString(local_size) + " is larger than " + ToString(device.MaxWorkGroupSize()));
}
// Make sure the global thread sizes are at least equal to the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
if (global[i] < local[i]) { global[i] = local[i]; }
}
+
+ // Verify that the global thread sizes are a multiple of the local sizes
+ for (auto i=size_t{0}; i<global.size(); ++i) {
+ if ((global[i] / local[i]) * local[i] != global[i]) {
+ throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim,
+ ToString(global[i]) + " is not divisible by " + ToString(local[i]));
+ }
+ }
}
// Tests for local memory usage
@@ -77,11 +87,10 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
- const std::shared_ptr<Program> program, const Databases &,
+ const std::shared_ptr<Program> program,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
- const Buffer<T> &dest,
- const T constant_value) {
+ const Buffer<T> &dest, const T constant_value, const size_t local_size) {
auto kernel = Kernel(program, "FillMatrix");
kernel.SetArgument(0, static_cast<int>(m));
kernel.SetArgument(1, static_cast<int>(n));
@@ -89,63 +98,62 @@ void FillMatrix(Queue &queue, const Device &device,
kernel.SetArgument(3, static_cast<int>(offset));
kernel.SetArgument(4, dest());
kernel.SetArgument(5, GetRealArg(constant_value));
- auto local = std::vector<size_t>{16, 1};
- auto global = std::vector<size_t>{Ceil(m, 16), n};
+ auto local = std::vector<size_t>{local_size, 1};
+ auto global = std::vector<size_t>{Ceil(m, local_size), n};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Compiles the above function
-template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
+template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
- const size_t, const size_t, const Buffer<half>&, const half);
-template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
+ const size_t, const size_t, const Buffer<half>&, const half, const size_t);
+template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
- const size_t, const size_t, const Buffer<float>&, const float);
-template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
+ const size_t, const size_t, const Buffer<float>&, const float, const size_t);
+template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
- const size_t, const size_t, const Buffer<double>&, const double);
-template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
+ const size_t, const size_t, const Buffer<double>&, const double, const size_t);
+template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
- const size_t, const size_t, const Buffer<float2>&, const float2);
-template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
+ const size_t, const size_t, const Buffer<float2>&, const float2, const size_t);
+template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
- const size_t, const size_t, const Buffer<double2>&, const double2);
+ const size_t, const size_t, const Buffer<double2>&, const double2, const size_t);
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
- const std::shared_ptr<Program> program, const Databases &,
+ const std::shared_ptr<Program> program,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
- const Buffer<T> &dest,
- const T constant_value) {
+ const Buffer<T> &dest, const T constant_value, const size_t local_size) {
auto kernel = Kernel(program, "FillVector");
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, static_cast<int>(inc));
kernel.SetArgument(2, static_cast<int>(offset));
kernel.SetArgument(3, dest());
kernel.SetArgument(4, GetRealArg(constant_value));
- auto local = std::vector<size_t>{16};
- auto global = std::vector<size_t>{Ceil(n, 16)};
+ auto local = std::vector<size_t>{local_size};
+ auto global = std::vector<size_t>{Ceil(n, local_size)};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Compiles the above function
-template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
+template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
- const size_t, const Buffer<half>&, const half);
-template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
+ const size_t, const Buffer<half>&, const half, const size_t);
+template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
- const size_t, const Buffer<float>&, const float);
-template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
+ const size_t, const Buffer<float>&, const float, const size_t);
+template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
- const size_t, const Buffer<double>&, const double);
-template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
+ const size_t, const Buffer<double>&, const double, const size_t);
+template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
- const size_t, const Buffer<float2>&, const float2);
-template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
+ const size_t, const Buffer<float2>&, const float2, const size_t);
+template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
- const size_t, const Buffer<double2>&, const double2);
+ const size_t, const Buffer<double2>&, const double2, const size_t);
// =================================================================================================
} // namespace clblast
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index b909243d..c30a2e0e 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -36,20 +36,18 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
- const std::shared_ptr<Program> program, const Databases &,
+ const std::shared_ptr<Program> program,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
- const Buffer<T> &dest,
- const T constant_value);
+ const Buffer<T> &dest, const T constant_value, const size_t local_size);
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
- const std::shared_ptr<Program> program, const Databases &,
+ const std::shared_ptr<Program> program,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
- const Buffer<T> &dest,
- const T constant_value);
+ const Buffer<T> &dest, const T constant_value, const size_t local_size);
// =================================================================================================
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index 36c33a76..76401753 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -68,7 +68,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
// Launches the kernel
const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
- const auto global = std::vector<size_t>{1};
+ const auto global = std::vector<size_t>{Ceil(n, db_["TRSV_BLOCK_SIZE"])};
auto event = Event();
RunKernel(kernel, queue_, device_, global, local, event.pointer());
event.WaitForCompletion();
@@ -87,6 +87,11 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
// Makes sure all dimensions are larger than zero
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
+ // Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
+ if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
+ throw RuntimeErrorCode(StatusCode::kNotImplemented);
+ }
+
// Tests the matrix and vector
TestMatrixA(n, n, a_buffer, a_offset, a_ld);
TestVectorX(n, b_buffer, b_offset, b_inc);
@@ -102,8 +107,8 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
// Fills the output buffer with zeros
auto eventWaitList = std::vector<Event>();
auto fill_vector_event = Event();
- FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList,
- n, x_inc, x_offset, x_buffer, ConstantZero<T>());
+ FillVector(queue_, device_, program_, fill_vector_event.pointer(), eventWaitList,
+ n, x_inc, x_offset, x_buffer, ConstantZero<T>(), 16);
fill_vector_event.WaitForCompletion();
// Derives properties based on the arguments
diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp
index ec84fbb7..ed8cc69d 100644
--- a/src/routines/level3/xgemm.hpp
+++ b/src/routines/level3/xgemm.hpp
@@ -25,9 +25,9 @@ class Xgemm: public Routine {
public:
// Defines the assumptions of the GEMM kernels
- static const bool a_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
- static const bool b_want_rotated_(const size_t gemm_kernel_id) { return true; }
- static const bool c_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
+ static bool a_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
+ static bool b_want_rotated_(const size_t) { return true; }
+ static bool c_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
// Computes the size of the temporary GEMM buffer based on user-arguments
static size_t GetTempSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index d622e3bf..905660ff 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -78,6 +78,11 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
+ // Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
+ if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
+ throw RuntimeErrorCode(StatusCode::kNotImplemented);
+ }
+
// Computes the k dimension. This is based on whether or not matrix is A (on the left)
// or B (on the right) in the Xgemm routine.
const auto k = (side == Side::kLeft) ? m : n;
@@ -105,8 +110,8 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
// Fills the output buffer with zeros
auto eventWaitList = std::vector<Event>();
auto fill_matrix_event = Event();
- FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList,
- x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero<T>());
+ FillMatrix(queue_, device_, program_, fill_matrix_event.pointer(), eventWaitList,
+ x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero<T>(), 16);
fill_matrix_event.WaitForCompletion();
// Inverts the diagonal blocks
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index a5ef9e10..eea8527a 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -49,9 +49,16 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
throw BLASError(StatusCode::kInvalidDimension);
}
+ // Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
+ if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
+ throw RuntimeErrorCode(StatusCode::kNotImplemented);
+ }
+
// Helper variables
const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
- assert(internal_block_size == 16);
+ if (internal_block_size != 16) {
+    throw RuntimeErrorCode(StatusCode::kNotImplemented); // e.g. Apple CPU OpenCL with a WGS of 1 when barriers are present
+  }
const auto num_blocks = CeilDiv(n, block_size);
const auto num_internal_blocks = CeilDiv(n, internal_block_size);
const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false;
@@ -75,8 +82,9 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
// Fills the output buffer with zeros
auto event_wait_list = std::vector<Event>();
auto fill_matrix_event = Event();
- FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list,
- block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
+ FillMatrix(queue_, device_, program_, fill_matrix_event.pointer(), event_wait_list,
+ block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>(),
+ 16);
event_wait_list.push_back(fill_matrix_event);
// Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
@@ -89,11 +97,11 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
kernel.SetArgument(5, static_cast<int>(block_size));
kernel.SetArgument(6, static_cast<int>(unit_diagonal));
kernel.SetArgument(7, static_cast<int>(is_upper));
- const auto local = std::vector<size_t>{internal_block_size};
- const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
+ const auto local_invert = std::vector<size_t>{internal_block_size};
+ const auto global_invert = std::vector<size_t>{num_internal_blocks * internal_block_size};
auto base_kernel_event = Event();
auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer();
- RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
+ RunKernel(kernel, queue_, device_, global_invert, local_invert, base_kernel_event_pointer, event_wait_list);
if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); }
// Builds up block_size x block_size blocks. For example, internal_block_size=16:
@@ -107,7 +115,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
const auto npages = CeilDiv(n, current_size*2);
const auto local0 = (current_size <= 32) ? current_size/4 : 16;
const auto local = std::vector<size_t>{local0, 4};
- const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
+ const auto global = std::vector<size_t>{Ceil(current_size/local[1], local[0]),
+ Ceil(npages*(current_size/16)*local[1], local[1])};
// Part 1
auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index 92aab611..7d886ebf 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -25,14 +25,15 @@ namespace clblast {
// =================================================================================================
template <typename T>
-void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
+void RunGemmRoutineMNK(const size_t m, const size_t n, const size_t k,
+ const Queue& queue, const std::vector<Buffer<T>>& buffers) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gemm(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
- value, value, value, ConstantOne<T>(),
- buffers[0](), 0, value,
- buffers[1](), 0, value, ConstantOne<T>(),
- buffers[2](), 0, value,
+ m, n, k, ConstantOne<T>(),
+ buffers[0](), 0, k,
+ buffers[1](), 0, n, ConstantOne<T>(),
+ buffers[2](), 0, n,
&queue_plain, &event);
if (status != StatusCode::kSuccess) {
throw RuntimeError("Gemm failed with status " + ToString(status));
@@ -40,6 +41,10 @@ void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Bu
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
+template <typename T>
+void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
+ RunGemmRoutineMNK(value, value, value, queue, buffers);
+}
template <typename T, size_t batch_count>
void RunGemmBatchedRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
@@ -80,6 +85,55 @@ void RunGemmStridedBatchedRoutine(const size_t value, const Queue& queue, const
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
+// =================================================================================================
+
+template <typename T>
+void TuneGemmSingleSize(const Platform& platform, const Device& device, const Context& context, Queue& queue,
+ const size_t m, const size_t n, const size_t k, const size_t num_runs) {
+
+ // Buffers
+ auto buffers = std::vector<Buffer<T>>{
+ Buffer<T>(context, m * k),
+ Buffer<T>(context, k * n),
+ Buffer<T>(context, m * n)
+ };
+ const auto FunctionToTune = [&]() { RunGemmRoutineMNK(m, n, k, queue, buffers); };
+
+ // Collects the timings for two methods
+ auto scores = std::vector<TuningResult>();
+ const auto methods = std::vector<std::string>{"in-direct", "direct"};
+ for (auto& method: methods) {
+
+ printf("* Testing the %s routine\n", method.c_str());
+ const auto limit = (method == "in-direct") ? 0 : std::max(std::max(m, n), k) + 1; // small or large number
+ ForceSelectIndirectFrom<T>(limit, device, "GemmRoutine", "XGEMM_MIN_INDIRECT_SIZE");
+ auto time_ms = -1.0;
+ try {
+ time_ms = TimeFunction(num_runs, FunctionToTune);
+ printf(" --> %9.2lf ms\n", time_ms);
+ }
+ catch (...) {
+ const auto status_code = DispatchExceptionCatchAll(true);
+ printf(" --> error %-5d\n", static_cast<int>(status_code));
+ }
+ auto tuning_results = Configuration();
+ tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = limit;
+ tuning_results["PRECISION"] = static_cast<size_t>(PrecisionValue<T>());
+ scores.push_back(TuningResult{"gemm_kernel_selection_single_size", time_ms, tuning_results});
+ }
+
+ // Outputs the results as JSON to disk, including some meta-data
+ const auto precision_string = std::to_string(static_cast<size_t>(PrecisionValue<T>()));
+ auto metadata = std::vector<std::pair<std::string,std::string>>{
+ {"kernel_family", "gemm_routine_single_size"},
+ {"precision", precision_string},
+ {"arg_m", ToString(m)},
+ {"arg_n", ToString(n)},
+ {"arg_k", ToString(k)},
+ };
+ PrintTimingsToFileAsJSON("clblast_gemm_routine_single_size_" + precision_string + ".json",
+ device, platform, metadata, scores);
+}
// =================================================================================================
@@ -91,6 +145,9 @@ void TuneXgemm(int argc, char* argv[]) {
const auto device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
const auto precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
+ const auto arg_m = GetArgument(command_line_args, help, kArgM, -1); // optional
+ const auto arg_n = GetArgument(command_line_args, help, kArgN, -1); // optional
+ const auto arg_k = GetArgument(command_line_args, help, kArgK, -1); // optional
fprintf(stdout, "%s\n", help.c_str());
// OpenCL initialisation
@@ -119,16 +176,29 @@ void TuneXgemm(int argc, char* argv[]) {
}
}
- // Run the tuners for the XGEMM routines
- TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
- 64, 2048, 64, 1, num_runs,
- "gemm", "GemmRoutine", "gemm_routine", "XGEMM_MIN_INDIRECT_SIZE");
- //TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmBatchedRoutine<T, 30>,
- // 16, 128, 32, 30, num_runs,
- // "gemmbatched", "GemmRoutine", "gemm_routine_2", "XGEMMBATCHED_MIN_INDIRECT_SIZE");
- //TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmStridedBatchedRoutine<T, 30>,
- // 16, 128, 32, 30, num_runs,
- // "gemmstridedbatched", "GemmRoutine", "gemm_routine_3", "XGEMMSTRIDEDBATCHED_MIN_INDIRECT_SIZE");
+ // Test for only one m/n/k size
+ if (arg_m != -1 || arg_n != -1 || arg_k != -1) {
+ printf("* Tuning for one specific size: m=%d, n=%d, k=%d\n", arg_m, arg_n, arg_k);
+ if (arg_m == -1 || arg_n == -1 || arg_k == -1) {
+      printf("* Error: If one of m/n/k is specified, please specify all three\n");
+ return;
+ }
+ TuneGemmSingleSize<T>(platform, device, context, queue, static_cast<size_t>(arg_m),
+ static_cast<size_t>(arg_n), static_cast<size_t>(arg_k), num_runs);
+ }
+
+ else {
+ // Run the tuners for the XGEMM routines
+ TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
+ 64, 2048, 64, 1, num_runs,
+ "gemm", "GemmRoutine", "gemm_routine", "XGEMM_MIN_INDIRECT_SIZE");
+ //TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmBatchedRoutine<T, 30>,
+ // 16, 128, 32, 30, num_runs,
+ // "gemmbatched", "GemmRoutine", "gemm_routine_2", "XGEMMBATCHED_MIN_INDIRECT_SIZE");
+ //TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmStridedBatchedRoutine<T, 30>,
+ // 16, 128, 32, 30, num_runs,
+ // "gemmstridedbatched", "GemmRoutine", "gemm_routine_3", "XGEMMSTRIDEDBATCHED_MIN_INDIRECT_SIZE");
+ }
printf("* Completed tuning process\n");
printf("\n");
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index a29e531a..16a241af 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -122,6 +122,7 @@ constexpr auto kArgHelp = "h";
constexpr auto kArgQuiet = "q";
constexpr auto kArgNoAbbreviations = "no_abbrv";
constexpr auto kArgNumRuns = "runs";
+constexpr auto kArgFullStatistics = "full_statistics";
// The buffer names
constexpr auto kBufVecX = "X";
@@ -245,6 +246,7 @@ struct Arguments {
size_t num_steps = 0;
size_t num_runs = 10;
std::vector<std::string> tuner_files = {};
+ bool full_statistics = false;
#ifdef CLBLAST_REF_CUBLAS
void* cublas_handle; // cublasHandle_t
#endif
diff --git a/test/performance/client.cpp b/test/performance/client.cpp
index 48690c3d..377e0140 100644
--- a/test/performance/client.cpp
+++ b/test/performance/client.cpp
@@ -17,6 +17,7 @@
#include <algorithm>
#include <chrono>
#include <random>
+#include "tuning/tuning.hpp"
#include "utilities/utilities.hpp"
#include "test/performance/client.hpp"
@@ -145,6 +146,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t le
args.print_help = CheckArgument(command_line_args, help, kArgHelp);
args.silent = CheckArgument(command_line_args, help, kArgQuiet);
args.no_abbrv = CheckArgument(command_line_args, help, kArgNoAbbreviations);
+  args.full_statistics = CheckArgument(command_line_args, help, kArgFullStatistics);
warm_up_ = CheckArgument(command_line_args, help, kArgWarmUp);
// Parse the optional JSON file name arguments
@@ -253,32 +255,32 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar};
// Runs the routines and collects the timings
- auto timings = std::vector<std::pair<std::string, double>>();
- auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
- timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
+ auto timings = std::vector<std::pair<std::string, TimeResult>>();
+ auto time_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
+ timings.push_back(std::pair<std::string, TimeResult>("CLBlast", time_clblast));
if (args.compare_clblas) {
- auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
- timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
+ auto time_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
+ timings.push_back(std::pair<std::string, TimeResult>("clBLAS", time_clblas));
}
if (args.compare_cblas) {
auto buffers_host = BuffersHost<T>();
DeviceToHost(args, buffers, buffers_host, queue, buffers_in_);
- auto ms_cblas = TimedExecution(args.num_runs, args, buffers_host, queue, run_reference2_, "CPU BLAS");
+ auto time_cblas = TimedExecution(args.num_runs, args, buffers_host, queue, run_reference2_, "CPU BLAS");
HostToDevice(args, buffers, buffers_host, queue, buffers_out_);
- timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas));
+ timings.push_back(std::pair<std::string, TimeResult>("CPU BLAS", time_cblas));
}
if (args.compare_cublas) {
auto buffers_host = BuffersHost<T>();
auto buffers_cuda = BuffersCUDA<T>();
DeviceToHost(args, buffers, buffers_host, queue, buffers_in_);
HostToCUDA(args, buffers_cuda, buffers_host, buffers_in_);
- auto ms_cublas = 0.0;
+ TimeResult time_cublas;
try {
- ms_cublas = TimedExecution(args.num_runs, args, buffers_cuda, queue, run_reference3_, "cuBLAS");
+ time_cublas = TimedExecution(args.num_runs, args, buffers_cuda, queue, run_reference3_, "cuBLAS");
} catch (std::runtime_error e) { }
CUDAToHost(args, buffers_cuda, buffers_host, buffers_out_);
HostToDevice(args, buffers, buffers_host, queue, buffers_out_);
- timings.push_back(std::pair<std::string, double>("cuBLAS", ms_cublas));
+ timings.push_back(std::pair<std::string, TimeResult>("cuBLAS", time_cublas));
}
// Prints the performance of the tested libraries
@@ -311,9 +313,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
// value found in the vector of timing results. The return value is in milliseconds.
template <typename T, typename U>
template <typename BufferType, typename RoutineType>
-double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
- BufferType &buffers, Queue &queue,
- RoutineType run_blas, const std::string &library_name) {
+typename Client<T,U>::TimeResult Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
+ BufferType &buffers, Queue &queue,
+ RoutineType run_blas, const std::string &library_name) {
auto status = StatusCode::kSuccess;
// Do an optional warm-up to omit compilation times and initialisations from the measurements
@@ -343,7 +345,19 @@ double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &ar
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
}
- return *std::min_element(timings.begin(), timings.end());
+
+ // Compute statistics
+ auto result = TimeResult();
+ const auto sum = std::accumulate(timings.begin(), timings.end(), 0.0);
+ const auto mean = sum / timings.size();
+ std::vector<double> diff(timings.size());
+ std::transform(timings.begin(), timings.end(), diff.begin(), [mean](double x) { return x - mean; });
+ const auto sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
+ result.mean = mean;
+ result.standard_deviation = std::sqrt(sq_sum / timings.size());
+ result.minimum = *std::min_element(timings.begin(), timings.end());
+ result.maximum = *std::max_element(timings.begin(), timings.end());
+ return result;
}
// =================================================================================================
@@ -355,26 +369,42 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
// First line (optional)
if (!args.silent) {
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
- fprintf(stdout, " | <-- CLBlast -->");
- if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
- if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
- if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); }
+ if (args.full_statistics) {
+ fprintf(stdout, " | <-- CLBlast -->");
+ if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
+ if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
+ if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); }
+ }
+ else {
+ fprintf(stdout, " | <-- CLBlast -->");
+ if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
+ if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
+ if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); }
+ }
fprintf(stdout, " |\n");
}
// Second line
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
- fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
- if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
- if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
- if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_4", "GFLOPS_4", "GBs_4"); }
+ if (args.full_statistics) {
+ fprintf(stdout, "%9s;%9s;%9s;%9s", "min_ms_1", "max_ms_1", "mean_1", "stddev_1");
+ if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_2", "max_ms_2", "mean_2", "stddev_2"); }
+ if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_3", "max_ms_3", "mean_3", "stddev_3"); }
+ if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_4", "max_ms_4", "mean_4", "stddev_4"); }
+ }
+ else {
+ fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
+ if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
+ if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
+ if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_4", "GFLOPS_4", "GBs_4"); }
+ }
fprintf(stdout, "\n");
}
// Print a performance-result row
template <typename T, typename U>
void Client<T,U>::PrintTableRow(const Arguments<U>& args,
- const std::vector<std::pair<std::string, double>>& timings) {
+ const std::vector<std::pair<std::string, TimeResult>>& timings) {
// Creates a vector of relevant variables
auto integers = std::vector<size_t>{};
@@ -443,16 +473,26 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args,
// Loops over all tested libraries
for (const auto& timing : timings) {
+ const auto library_name = timing.first;
+ const auto minimum_ms = timing.second.minimum;
+ if (library_name != "CLBlast") { fprintf(stdout, ";"); }
+
+ // Either output full statistics
+ if (args.full_statistics) {
+ const auto maximum_ms = timing.second.maximum;
+ const auto mean_ms = timing.second.mean;
+ const auto standard_deviation = timing.second.standard_deviation;
+ fprintf(stdout, "%9.3lf;%9.3lf;%9.3lf;%9.3lf", minimum_ms, maximum_ms, mean_ms, standard_deviation);
+ }
- // Computes the GFLOPS and GB/s metrics
- auto flops = get_flops_(args);
- auto bytes = get_bytes_(args);
- auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0;
- auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0;
-
- // Outputs the performance numbers
- if (timing.first != "CLBlast") { fprintf(stdout, ";"); }
- fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", timing.second, gflops, gbs);
+ // ... or outputs minimum time and the GFLOPS and GB/s metrics
+ else {
+ const auto flops = get_flops_(args);
+ const auto bytes = get_bytes_(args);
+ const auto gflops = (minimum_ms != 0.0) ? (flops*1e-6)/minimum_ms : 0;
+ const auto gbs = (minimum_ms != 0.0) ? (bytes*1e-6)/minimum_ms : 0;
+ fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", minimum_ms, gflops, gbs);
+ }
}
fprintf(stdout, "\n");
}
diff --git a/test/performance/client.hpp b/test/performance/client.hpp
index eb224976..0cec242f 100644
--- a/test/performance/client.hpp
+++ b/test/performance/client.hpp
@@ -42,6 +42,7 @@ template <typename T, typename U>
class Client {
public:
static const int kSeed;
+ struct TimeResult { double minimum; double maximum; double mean; double standard_deviation; };
// Shorthand for the routine-specific functions passed to the tester
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
@@ -72,15 +73,15 @@ class Client {
// Runs a function a given number of times and returns the execution time of the shortest instance
template <typename BufferType, typename RoutineType>
- double TimedExecution(const size_t num_runs, const Arguments<U> &args, BufferType &buffers,
- Queue &queue, RoutineType run_blas, const std::string &library_name);
+ TimeResult TimedExecution(const size_t num_runs, const Arguments<U> &args, BufferType &buffers,
+ Queue &queue, RoutineType run_blas, const std::string &library_name);
// Prints the header of a performance-data table
void PrintTableHeader(const Arguments<U>& args);
// Prints a row of performance data, including results of two libraries
void PrintTableRow(const Arguments<U>& args,
- const std::vector<std::pair<std::string, double>>& timings);
+ const std::vector<std::pair<std::string, TimeResult>>& timings);
// The routine-specific functions passed to the tester
const Routine run_routine_;
diff --git a/test/wrapper_cblas.hpp b/test/wrapper_cblas.hpp
index 408f084b..a47ff725 100644
--- a/test/wrapper_cblas.hpp
+++ b/test/wrapper_cblas.hpp
@@ -17,7 +17,11 @@
extern "C"
{
- #include <cblas.h>
+ #ifdef CLBLAST_REF_CBLAS_MKL
+ #include <mkl_cblas.h>
+ #else
+ #include <cblas.h>
+ #endif
}
#include "utilities/utilities.hpp"