113 files changed, 1878 insertions, 244 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 1995dc84..0d517869 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,14 @@
 
+Development version (next release)
+- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
+- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
+- Fixed a bug in the tests and samples related to waiting for an invalid event
+- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
+- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
+- Added an option to run tuned kernels multiple times to average execution times
+- Various minor fixes and enhancements
+- Added tuned parameters for various devices (see README)
+
 Version 0.9.0
 - Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header
 - Improved performance significantly of rotated GEMV computations
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b422743f..9b028448 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,7 +135,8 @@ endif()
 # ==================================================================================================
 
 # Sets the supported routines and the used kernels. New routines and kernels should be added here.
-set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv)
+set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
+            xgemm xgemm_direct xgemv)
 set(SAMPLE_PROGRAMS_CPP sgemm)
 set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
@@ -278,8 +279,9 @@ if(CLIENTS OR TESTS)
   set(REF_INCLUDES )
   set(REF_LIBRARIES )
   if(CLBLAS_FOUND)
+    find_package(Threads)
+    set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
     set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
-    set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
     if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
       add_definitions(" /DCLBLAST_REF_CLBLAS")
     else()
@@ -362,7 +364,7 @@ if(TESTS)
                 test/correctness/tester.cpp test/correctness/testblas.cpp)
     target_include_directories(test_correctness_common PUBLIC
                                $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
-                               ${clblast_SOURCE_DIR})
+                               ${clblast_SOURCE_DIR} ${REF_INCLUDES})
     set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>)
   endif()
 
diff --git a/LICENSE b/LICENSE
index ae43189f..75f63024 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,14 +1,21 @@
+MIT License
 
-Copyright (c) 2015 Cedric Nugteren
+Copyright (c) 2016 Cedric Nugteren
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
- http://www.apache.org/licenses/LICENSE-2.0
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 7f6a3d96..33282d8f 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ The pre-requisites for compilation of CLBlast are:
   - Clang 3.3 or newer
   - AppleClang 5.0 or newer
   - ICC 14.0 or newer
-  - MSVC (Visual Studio) 2015 or newer
+  - MSVC (Visual Studio) 2013 or newer
 * An OpenCL 1.1 or newer library, for example:
   - Apple OpenCL
   - NVIDIA CUDA SDK
@@ -90,6 +90,8 @@ Afterwards, any of CLBlast's routines can be called directly: there is no need t
 
     cmake -DSAMPLES=ON ..
 
+Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
+
 
 Using the tuners (optional)
 -------------
@@ -117,8 +119,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
   - Tahiti
 * Intel GPUs:
   - HD Graphics 530
-  - HD Graphics Haswell Ultrabook GT2 Mobile
   - HD Graphics 5500 BroadWell U-Processor GT2
+  - HD Graphics Haswell Ultrabook GT2 Mobile
+  - HD Graphics IvyBridge M GT2
   - HD Graphics Skylake ULT GT2
   - Iris
   - Iris Pro
@@ -134,7 +137,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
 
     cmake -DTUNERS=ON ..
 
-Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher).
+Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.5.0 or higher).
 
 Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
 
diff --git a/doc/performance/GeForce_GTX480/SAXPY.pdf b/doc/performance/GeForce_GTX480/SAXPY.pdf
deleted file mode 100644
index 6e1c8f5a..00000000
--- a/doc/performance/GeForce_GTX480/SAXPY.pdf
+++ /dev/null
diff --git a/doc/performance/GeForce_GTX480/SGEMM.pdf b/doc/performance/GeForce_GTX480/SGEMM.pdf
deleted file mode 100644
index f430f880..00000000
--- a/doc/performance/GeForce_GTX480/SGEMM.pdf
+++ /dev/null
diff --git a/doc/performance/GeForce_GTX480/SGEMV.pdf b/doc/performance/GeForce_GTX480/SGEMV.pdf
deleted file mode 100644
index 8cb57124..00000000
--- a/doc/performance/GeForce_GTX480/SGEMV.pdf
+++ /dev/null
diff --git a/doc/performance/GeForce_GTX480/SSYMM.pdf b/doc/performance/GeForce_GTX480/SSYMM.pdf
deleted file mode 100644
index ff5941ad..00000000
--- a/doc/performance/GeForce_GTX480/SSYMM.pdf
+++ /dev/null
diff --git a/doc/performance/GeForce_GTX750Ti/SAXPY.pdf b/doc/performance/GeForce_GTX750Ti/SAXPY.pdf
new file mode 100644
index 00000000..531baa79
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SAXPY.pdf
diff --git a/doc/performance/GeForce_GTX750Ti/SGEMM.pdf b/doc/performance/GeForce_GTX750Ti/SGEMM.pdf
new file mode 100644
index 00000000..dcd62929
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SGEMM.pdf
diff --git a/doc/performance/GeForce_GTX750Ti/SGEMV.pdf b/doc/performance/GeForce_GTX750Ti/SGEMV.pdf
new file mode 100644
index 00000000..a4c3efb3
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SGEMV.pdf
diff --git a/doc/performance/GeForce_GTX750Ti/SSYMM.pdf b/doc/performance/GeForce_GTX750Ti/SSYMM.pdf
new file mode 100644
index 00000000..43d97d24
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SSYMM.pdf
diff --git a/doc/performance/Intel_IrisPro/SAXPY.pdf b/doc/performance/Intel_IrisPro/SAXPY.pdf
index 3a51f306..8d639b24 100644
--- a/doc/performance/Intel_IrisPro/SAXPY.pdf
+++ b/doc/performance/Intel_IrisPro/SAXPY.pdf
diff --git a/doc/performance/Intel_IrisPro/SGEMM.pdf b/doc/performance/Intel_IrisPro/SGEMM.pdf
index 15f1714f..31725025 100644
--- a/doc/performance/Intel_IrisPro/SGEMM.pdf
+++ b/doc/performance/Intel_IrisPro/SGEMM.pdf
diff --git a/doc/performance/Intel_IrisPro/SGEMV.pdf b/doc/performance/Intel_IrisPro/SGEMV.pdf
index e1660999..9ec120c4 100644
--- a/doc/performance/Intel_IrisPro/SGEMV.pdf
+++ b/doc/performance/Intel_IrisPro/SGEMV.pdf
diff --git a/doc/performance/Radeon_M370X/SGEMM.pdf b/doc/performance/Radeon_M370X/SGEMM.pdf
index 5dca8f03..da5722f9 100644
--- a/doc/performance/Radeon_M370X/SGEMM.pdf
+++ b/doc/performance/Radeon_M370X/SGEMM.pdf
diff --git a/doc/performance/Radeon_M370X/SGEMV.pdf b/doc/performance/Radeon_M370X/SGEMV.pdf
index fa661249..513318bf 100644
--- a/doc/performance/Radeon_M370X/SGEMV.pdf
+++ b/doc/performance/Radeon_M370X/SGEMV.pdf
diff --git a/doc/performance/Radeon_M370X/SSYMM.pdf b/doc/performance/Radeon_M370X/SSYMM.pdf
index 852181d1..03efd198 100644
--- a/doc/performance/Radeon_M370X/SSYMM.pdf
+++ b/doc/performance/Radeon_M370X/SSYMM.pdf
diff --git a/include/clblast_half.h b/include/clblast_half.h
index 269a520e..05d96f9f 100644
--- a/include/clblast_half.h
+++ b/include/clblast_half.h
@@ -25,6 +25,11 @@
   #include <CL/opencl.h>
 #endif
 
+// MSVC 2013 doesn't fully support C99
+#ifdef _MSC_VER
+    #define inline __inline
+#endif
+
 // =================================================================================================
 
 // Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type,
diff --git a/samples/cache.c b/samples/cache.c
index a592824d..abc8ad4b 100644
--- a/samples/cache.c
+++ b/samples/cache.c
@@ -112,8 +112,10 @@ void run_example_routine(const cl_device_id device) {
                                    &queue, &event);
 
   // Wait for completion
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
 
   // Retrieves the execution time
   clock_t diff = clock() - start;
diff --git a/samples/dgemv.c b/samples/dgemv.c
index c22c9f37..a15d649a 100644
--- a/samples/dgemv.c
+++ b/samples/dgemv.c
@@ -84,8 +84,10 @@ int main(void) {
                                    &queue, &event);
 
   // Wait for completion
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
 
   // Example completed. See "clblast_c.h" for status codes (0 -> success).
   printf("Completed DGEMV with status %d\n", status);
diff --git a/samples/haxpy.c b/samples/haxpy.c
index d5b98e12..5bab3d42 100644
--- a/samples/haxpy.c
+++ b/samples/haxpy.c
@@ -77,8 +77,10 @@ int main(void) {
                                    &queue, &event);
 
   // Wait for completion
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
 
   // Copies the result back to the host
   clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
diff --git a/samples/sasum.c b/samples/sasum.c
index 1518cc13..02f924b0 100644
--- a/samples/sasum.c
+++ b/samples/sasum.c
@@ -73,8 +73,10 @@ int main(void) {
                                    &queue, &event);
 
   // Wait for completion
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
 
   // Copies the result back to the host
   clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
diff --git a/samples/sgemm.c b/samples/sgemm.c
index b4827777..583fc261 100644
--- a/samples/sgemm.c
+++ b/samples/sgemm.c
@@ -87,8 +87,10 @@ int main(void) {
                                    &queue, &event);
 
   // Wait for completion
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
 
   // Example completed. See "clblast_c.h" for status codes (0 -> success).
   printf("Completed SGEMM with status %d\n", status);
diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp
index a4b89968..401ecff8 100644
--- a/samples/sgemm.cpp
+++ b/samples/sgemm.cpp
@@ -95,8 +95,10 @@ int main() {
                               &queue_plain, &event);
 
   // Record the execution time
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == clblast::StatusCode::kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
   auto elapsed_time = std::chrono::steady_clock::now() - start_time;
   auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
 
diff --git a/scripts/database/database.py b/scripts/database/database.py
index f758a2b7..31f313da 100755
--- a/scripts/database/database.py
+++ b/scripts/database/database.py
@@ -18,7 +18,7 @@ import database.bests as bests
 import database.defaults as defaults
 
 # Server storing a copy of the database
-DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.json"
+DATABASE_SERVER_URL = "https://raw.githubusercontent.com/CNugteren/CLBlast-database/master/database.json"
 
 # OpenCL vendor names and their short name
 VENDOR_TRANSLATION_TABLE = {
diff --git a/scripts/graphs/common.r b/scripts/graphs/common.r
index cd68cf26..e5dad616 100644
--- a/scripts/graphs/common.r
+++ b/scripts/graphs/common.r
@@ -31,8 +31,12 @@ options("width"=170)
 
 # ==================================================================================================
 
-# Constants
-num_runs <- 4
+# Settings
+num_runs <- 5
+num_runs_short <- 50
+xtics_subset_threshold <- 100
+xtics_subset_stepsize <- 8
+
 devices <- c("-platform","-device")
 options_string <- "-q -no_abbrv -cblas 0"
 library_names <- c("CLBlast", "clBLAS")
@@ -66,11 +70,21 @@ main <- function(routine_name, precision, test_names, test_values,
   executable <- paste("./clblast_client_", routine_name, sep="")
 
   # Configures the outputfile
-  pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)
-  par(mfrow=c(2, 3))
-  par(oma=c(0, 0, 0, 0))
-  par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
-  par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
+  file_name <- paste(display_name, ".pdf", sep="")
+  if (length(test_names) == 6) {
+    pdf(file_name, height=8, width=13)
+    par(mfrow=c(2, 3))
+    par(oma=c(0, 0, 0, 0))
+    par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
+    par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
+  }
+  else { # length(test_names) == 2
+    pdf(file_name, height=8, width=13)
+    par(mfrow=c(2, 1))
+    par(oma=c(0, 0, 0, 0))
+    par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
+    par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
+  }
 
   # Loops over the test-cases
   for (test_id in 1:length(test_names)) {
@@ -169,7 +183,12 @@ plot_graph <- function(xdata, ydata, log_setting,
        main="", xlab="", ylab="",
        ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, "n")
   axis(side=2, las=2)
-  axis(side=1, at=xdata, labels=xtics, las=2)
+  if (length(xdata) > xtics_subset_threshold) {  # Too many indices to print, plot only every Nth
+    subset <- seq(from=1, to=length(xdata), by=xtics_subset_stepsize)
+    axis(side=1, at=xdata[subset], labels=xtics[subset], las=2)
+  } else {
+    axis(side=1, at=xdata, labels=xtics, las=2)
+  }
   title(xlab=xlabel, line=-1)
   title(ylab=ylabel, line=2)
   title(graph_title, line=-2)
diff --git a/scripts/graphs/xgemm_small.r b/scripts/graphs/xgemm_small.r
new file mode 100644
index 00000000..ef94ef20
--- /dev/null
+++ b/scripts/graphs/xgemm_small.r
@@ -0,0 +1,56 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for small sizes of Xgemm, testing the direct kernel
+#
+# ==================================================================================================
+
+# Includes the common functions
+args <- commandArgs(trailingOnly = FALSE)
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xgemm"
+parameters <- c("-m","-n","-k","-layout","-transA","-transB",
+                "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+  "small matrices in steps of 16",
+  "small matrices in steps of 1"
+)
+
+# Defines the test-cases
+test_values <- list(
+  list(c( 128,  128,  128, 102, 111, 111, 57, 16, num_runs_short, precision)),
+  list(c( 128,  128,  128, 102, 111, 111, 385, 1, num_runs_short, precision))
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+  "matrix sizes (m=n=k)",
+  "matrix sizes (m=n=k)"
+)
+
+# Defines the x-axis of the test-cases
+test_xaxis <- list(
+  c("m", ""),
+  c("m", "")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+     test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
+
+# ==================================================================================================
+\ No newline at end of file
diff --git a/scripts/graphs/xsymm.r b/scripts/graphs/xsymm.r
index a65bb16f..89d137d2 100644
--- a/scripts/graphs/xsymm.r
+++ b/scripts/graphs/xsymm.r
@@ -35,32 +35,32 @@ test_names <- list(
 
 # Defines the test-cases
 test_values <- list(
-  list(c( 128,  128, 102, 111, 111, 16, 128, num_runs, precision)),
-  list(c( 129,  129, 102, 111, 111, 16, 128, num_runs, precision)),
-  list(c( 512,  512, 102, 111, 111, 16, 1, num_runs, precision)),
-  list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
+  list(c( 128,  128, 102, 141, 121, 16, 128, num_runs, precision)),
+  list(c( 129,  129, 102, 141, 121, 16, 128, num_runs, precision)),
+  list(c( 512,  512, 102, 141, 121, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 102, 141, 121, 16, 1, num_runs, precision)),
   list(
-    c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
+    c(1024, 1024, 101, 141, 121, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 141, 122, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 142, 121, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 142, 122, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 141, 122, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 142, 121, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 142, 122, 1, 0, num_runs, precision)
   ),
   list(
-    c(   8,    8, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  16,   16, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  32,   32, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  64,   64, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 128,  128, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 256,  256, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 512,  512, 102, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
-    c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
-    c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
-    c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
+    c(   8,    8, 102, 141, 121, 1, 0, num_runs, precision),
+    c(  16,   16, 102, 141, 121, 1, 0, num_runs, precision),
+    c(  32,   32, 102, 141, 121, 1, 0, num_runs, precision),
+    c(  64,   64, 102, 141, 121, 1, 0, num_runs, precision),
+    c( 128,  128, 102, 141, 121, 1, 0, num_runs, precision),
+    c( 256,  256, 102, 141, 121, 1, 0, num_runs, precision),
+    c( 512,  512, 102, 141, 121, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
+    c(2048, 2048, 102, 141, 121, 1, 0, num_runs, precision),
+    c(4096, 4096, 102, 141, 121, 1, 0, num_runs, precision),
+    c(8192, 8192, 102, 141, 121, 1, 0, num_runs, precision)
   )
 )
 
diff --git a/scripts/graphs/xsyrk.r b/scripts/graphs/xsyrk.r
index 4ab46c9f..754c93e2 100644
--- a/scripts/graphs/xsyrk.r
+++ b/scripts/graphs/xsyrk.r
@@ -35,32 +35,32 @@ test_names <- list(
 
 # Defines the test-cases
 test_values <- list(
-  list(c( 128,  128, 102, 111, 111, 16, 128, num_runs, precision)),
-  list(c( 129,  129, 102, 111, 111, 16, 128, num_runs, precision)),
-  list(c( 512,  512, 102, 111, 111, 16, 1, num_runs, precision)),
-  list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
+  list(c( 128,  128, 102, 121, 111, 16, 128, num_runs, precision)),
+  list(c( 129,  129, 102, 121, 111, 16, 128, num_runs, precision)),
+  list(c( 512,  512, 102, 121, 111, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 102, 121, 111, 16, 1, num_runs, precision)),
   list(
-    c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
+    c(1024, 1024, 101, 121, 111, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 121, 112, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 122, 111, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 122, 112, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 121, 112, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 122, 111, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 122, 112, 1, 0, num_runs, precision)
   ),
   list(
-    c(   8,    8, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  16,   16, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  32,   32, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  64,   64, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 128,  128, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 256,  256, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 512,  512, 102, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
-    c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
-    c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
-    c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
+    c(   8,    8, 102, 121, 111, 1, 0, num_runs, precision),
+    c(  16,   16, 102, 121, 111, 1, 0, num_runs, precision),
+    c(  32,   32, 102, 121, 111, 1, 0, num_runs, precision),
+    c(  64,   64, 102, 121, 111, 1, 0, num_runs, precision),
+    c( 128,  128, 102, 121, 111, 1, 0, num_runs, precision),
+    c( 256,  256, 102, 121, 111, 1, 0, num_runs, precision),
+    c( 512,  512, 102, 121, 111, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
+    c(2048, 2048, 102, 121, 111, 1, 0, num_runs, precision),
+    c(4096, 4096, 102, 121, 111, 1, 0, num_runs, precision),
+    c(8192, 8192, 102, 121, 111, 1, 0, num_runs, precision)
   )
 )
 
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index d57223dd..aaa76cb4 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -12,8 +12,8 @@
 // Portability here means that a similar header exists for CUDA with the same classes and
 // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
 //
-// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore
-// contains the following header copyright notice:
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
 //
 // =================================================================================================
 //
@@ -97,14 +97,12 @@ class Event {
   // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
   float GetElapsedTime() const {
     WaitForCompletion();
-    auto bytes = size_t{0};
-    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
-    auto time_start = size_t{0};
+    const auto bytes = sizeof(cl_ulong);
+    auto time_start = cl_ulong{0};
     clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
-    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
-    auto time_end = size_t{0};
+    auto time_end = cl_ulong{0};
     clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
-    return (time_end - time_start) * 1.0e-6f;
+    return static_cast<float>(time_end - time_start) * 1.0e-6f;
   }
 
   // Accessor to the private data-member
@@ -152,6 +150,17 @@ class Platform {
   cl_platform_id platform_;
 };
 
+// Retrieves a vector with all platforms
+inline std::vector<Platform> GetAllPlatforms() {
+  auto num_platforms = cl_uint{0};
+  CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
+  auto all_platforms = std::vector<Platform>();
+  for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
+    all_platforms.push_back(Platform(platform_id));
+  }
+  return all_platforms;
+}
+
 // =================================================================================================
 
 // C++11 version of 'cl_device_id'
@@ -201,8 +210,8 @@ class Device {
   std::vector<size_t> MaxWorkItemSizes() const {
     return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
   }
-  cl_ulong LocalMemSize() const {
-    return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
+  unsigned long LocalMemSize() const {
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
   }
   std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
   size_t CoreClock() const {
@@ -238,9 +247,11 @@ class Device {
   // Query for a specific type of device or brand
   bool IsCPU() const { return Type() == "CPU"; }
   bool IsGPU() const { return Type() == "GPU"; }
-  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
+                              Vendor() == "AuthenticAMD";; }
   bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
-  bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
+  bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
+                                Vendor() == "GenuineIntel"; }
   bool IsARM() const { return Vendor() == "ARM"; }
 
   // Accessor to the private data-member
@@ -606,8 +617,7 @@ class Buffer {
 
   // Retrieves the actual allocated size in bytes
   size_t GetSize() const {
-    auto bytes = size_t{0};
-    CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
+    const auto bytes = sizeof(size_t);
     auto result = size_t{0};
     CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
     return result;
@@ -658,17 +668,16 @@ class Kernel {
   }
 
   // Retrieves the amount of local memory used per work-group for this kernel
-  cl_ulong LocalMemUsage(const Device &device) const {
-    auto bytes = size_t{0};
+  unsigned long LocalMemUsage(const Device &device) const {
+    const auto bytes = sizeof(cl_ulong);
     auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
-    CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
     auto result = cl_ulong{0};
     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
-    return result;
+    return static_cast<unsigned long>(result);
   }
 
   // Retrieves the name of the kernel
-  std::string GetFunctionName() {
+  std::string GetFunctionName() const {
     auto bytes = size_t{0};
     CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
     auto result = std::string{};
@@ -689,6 +698,7 @@ class Kernel {
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event,
               const std::vector<Event> &waitForEvents) {
+
     // Builds a plain version of the events waiting list
     auto waitForEventsPlain = std::vector<cl_event>();
     for (auto &waitEvent : waitForEvents) {
diff --git a/src/database/database.cpp b/src/database/database.cpp
index 34c44a29..1198cefb 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -21,10 +21,12 @@
 #include "database/kernels/xgemv_fast_rot.hpp"
 #include "database/kernels/xger.hpp"
 #include "database/kernels/xgemm.hpp"
+#include "database/kernels/xgemm_direct.hpp"
 #include "database/kernels/copy.hpp"
 #include "database/kernels/pad.hpp"
 #include "database/kernels/transpose.hpp"
 #include "database/kernels/padtranspose.hpp"
+#include "database/kernel_selection.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -38,10 +40,29 @@ const std::vector<Database::DatabaseEntry> Database::database = {
   XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble,
   XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
   XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
+  XgemmDirectHalf, XgemmDirectSingle, XgemmDirectDouble, XgemmDirectComplexSingle, XgemmDirectComplexDouble,
   CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
   PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
   TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
-  PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
+  PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble,
+  KernelSelectionHalf, KernelSelectionSingle, KernelSelectionDouble, KernelSelectionComplexSingle, KernelSelectionComplexDouble
+};
+
+// The OpenCL device types
+const std::string Database::kDeviceTypeCPU = "CPU";
+const std::string Database::kDeviceTypeGPU = "GPU";
+const std::string Database::kDeviceTypeAccelerator = "accelerator";
+const std::string Database::kDeviceTypeAll = "default";
+
+// The OpenCL device vendors
+const std::string Database::kDeviceVendorAll = "default";
+
+// Alternative names for some OpenCL vendors
+const std::unordered_map<std::string, std::string> Database::kVendorNames{
+  { "Intel(R) Corporation", "Intel" },
+  { "GenuineIntel", "Intel" },
+  { "Advanced Micro Devices, Inc.", "AMD" },
+  { "NVIDIA Corporation", "NVIDIA" },
 };
 
 // =================================================================================================
diff --git a/src/database/database.hpp b/src/database/database.hpp
index a6ab49c5..346fe089 100644
--- a/src/database/database.hpp
+++ b/src/database/database.hpp
@@ -36,36 +36,31 @@ class Database {
 
   // Structures for content inside the database
   struct DatabaseDevice {
-    const std::string name;
-    const Parameters parameters;
+    std::string name;
+    Parameters parameters;
   };
   struct DatabaseVendor {
-    const std::string type;
-    const std::string name;
-    const std::vector<DatabaseDevice> devices;
+    std::string type;
+    std::string name;
+    std::vector<DatabaseDevice> devices;
   };
   struct DatabaseEntry {
-    const std::string kernel;
-    const Precision precision;
-    const std::vector<DatabaseVendor> vendors;
+    std::string kernel;
+    Precision precision;
+    std::vector<DatabaseVendor> vendors;
   };
 
   // The OpenCL device types
-  static constexpr auto kDeviceTypeCPU = "CPU";
-  static constexpr auto kDeviceTypeGPU = "GPU";
-  static constexpr auto kDeviceTypeAccelerator = "accelerator";
-  static constexpr auto kDeviceTypeAll = "default";
+  static const std::string kDeviceTypeCPU;
+  static const std::string kDeviceTypeGPU;
+  static const std::string kDeviceTypeAccelerator;
+  static const std::string kDeviceTypeAll;
 
   // The OpenCL device vendors
-  static constexpr auto kDeviceVendorAll = "default";
+  static const std::string kDeviceVendorAll;
 
   // Alternative names for some OpenCL vendors
-  const std::unordered_map<std::string,std::string> kVendorNames {
-    {"Intel(R) Corporation", "Intel"},
-    {"GenuineIntel", "Intel"},
-    {"Advanced Micro Devices, Inc.", "AMD"},
-    {"NVIDIA Corporation", "NVIDIA"},
-  };
+  static const std::unordered_map<std::string, std::string> kVendorNames;
 
   // The database consists of separate database entries, stored together in a vector
   static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
@@ -75,10 +70,12 @@ class Database {
   static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble;
   static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
   static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
+  static const DatabaseEntry XgemmDirectHalf, XgemmDirectSingle, XgemmDirectDouble, XgemmDirectComplexSingle, XgemmDirectComplexDouble;
   static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
   static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
   static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
   static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
+  static const DatabaseEntry KernelSelectionHalf, KernelSelectionSingle, KernelSelectionDouble, KernelSelectionComplexSingle, KernelSelectionComplexDouble;
   static const std::vector<DatabaseEntry> database;
 
   // The constructor with a user-provided database overlay (potentially an empty vector)
diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp
new file mode 100644
index 00000000..c9462c7a
--- /dev/null
+++ b/src/database/kernel_selection.hpp
@@ -0,0 +1,129 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel
+// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the
+// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use
+// some common default values.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::KernelSelectionHalf = {
+  "KernelSelection", Precision::kHalf, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::KernelSelectionSingle = {
+  "KernelSelection", Precision::kSingle, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::KernelSelectionComplexSingle = {
+  "KernelSelection", Precision::kComplexSingle, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::KernelSelectionDouble = {
+  "KernelSelection", Precision::kDouble, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::KernelSelectionComplexDouble = {
+  "KernelSelection", Precision::kComplexDouble, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
index a6b7dfe8..479c7f78 100644
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::CopySingle = {
         { "Intel(R) HD Graphics 530",                        { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
@@ -84,7 +85,7 @@ const Database::DatabaseEntry Database::CopySingle = {
         { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
@@ -128,6 +129,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
         { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
@@ -147,7 +149,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
         { "GeForce GTX 480",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
@@ -205,13 +207,13 @@ const Database::DatabaseEntry Database::CopyDouble = {
         { "GeForce GTX 670",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
       }
     },
     { // Default
@@ -264,7 +266,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
         { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
index 3cfabaf4..48085139 100644
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::PadSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Iris Pro",                                        { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
@@ -84,7 +85,7 @@ const Database::DatabaseEntry Database::PadSingle = {
         { "GeForce GTX 670",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
-        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -134,10 +135,11 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Iris Pro",                                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
       }
     },
     { // Intel accelerators
@@ -154,13 +156,13 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
         { "GeForce GTX 670",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX 750",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Tesla K40m",                                      { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
     { // Default
@@ -272,7 +274,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
         { "GeForce GTX 670",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
index 88bd4ea7..f9448985 100644
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris",                                            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -134,6 +135,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Iris",                                            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -272,7 +274,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
         { "GeForce GTX 670",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 750",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
index 0e1b608e..191d2e98 100644
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
         { "Intel(R) HD Graphics 530",                        { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@@ -134,6 +135,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
@@ -159,7 +161,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
       }
     },
   }
@@ -207,7 +209,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
         { "GeForce GTX 670",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 750",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX TITAN",                               { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
index 9c1bcd99..70be5abc 100644
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@@ -64,6 +64,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",1}, {"WGS",512}, {"WPT",2} } },
         { "Iris",                                            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Iris Pro",                                        { {"VW",1}, {"WGS",128}, {"WPT",2} } },
@@ -84,7 +85,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
         { "GeForce GTX 670",                                 { {"VW",2}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW",2}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",4}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@@ -134,6 +135,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW",4}, {"WGS",64}, {"WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",2}, {"WGS",512}, {"WPT",1} } },
         { "Iris",                                            { {"VW",2}, {"WGS",128}, {"WPT",1} } },
         { "Iris Pro",                                        { {"VW",1}, {"WGS",256}, {"WPT",8} } },
@@ -213,7 +215,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
         { "GeForce GTX 670",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",256}, {"WPT",2} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",512}, {"WPT",1} } },
@@ -272,7 +274,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
         { "GeForce GTX 670",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",256}, {"WPT",2} } },
+        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",64}, {"WPT",2} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS",64}, {"WPT",4} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
index 987a990d..96a699aa 100644
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@@ -55,6 +55,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",64}, {"WGS2",32} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",512}, {"WGS2",128} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WGS2",32} } },
         { "Iris Pro",                                        { {"WGS1",512}, {"WGS2",64} } },
         { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
@@ -68,6 +69,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",512}, {"WGS2",1024} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",128} } },
         { "GeForce GTX 750",                                 { {"WGS1",128}, {"WGS2",32} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",1024}, {"WGS2",32} } },
@@ -106,6 +108,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",512}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",32}, {"WGS2",32} } },
         { "Iris Pro",                                        { {"WGS1",32}, {"WGS2",32} } },
         { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
@@ -119,6 +122,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",32} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",64} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
@@ -160,6 +164,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",256} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 980",                                 { {"WGS1",128}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
@@ -201,6 +206,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",512}, {"WGS2",128} } },
         { "GeForce GTX 680",                                 { {"WGS1",256}, {"WGS2",64} } },
         { "GeForce GTX 750",                                 { {"WGS1",256}, {"WGS2",32} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",128}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",128}, {"WGS2",32} } },
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
index d19c55b5..ffe3dc57 100644
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@@ -57,9 +57,10 @@ const Database::DatabaseEntry Database::XgemmSingle = {
         { "Intel(R) HD Graphics 530",                        { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
         { "Iris",                                            { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
-        { "Iris Pro",                                        { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
+        { "Iris Pro",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
@@ -77,7 +78,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
         { "GeForce GTX 670",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
         { "GeForce GTX 750",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } },
         { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
@@ -127,6 +128,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
         { "Iris",                                            { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Iris Pro",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
@@ -147,7 +149,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
         { "GeForce GTX 670",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX 750",                                 { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
         { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
@@ -206,7 +208,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
         { "GeForce GTX 670",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX 750",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
         { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -265,7 +267,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
         { "GeForce GTX 670",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
         { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 750",                                 { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Tesla K20m",                                      { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp
new file mode 100644
index 00000000..53dfeaa6
--- /dev/null
+++ b/src/database/kernels/xgemm_direct.hpp
@@ -0,0 +1,136 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDirectHalf = {
+  "XgemmDirect", Precision::kHalf, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDirectSingle = {
+  "XgemmDirect", Precision::kSingle, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Iris Pro",                                        { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDirectComplexSingle = {
+  "XgemmDirect", Precision::kComplexSingle, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Iris Pro",                                        { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDirectDouble = {
+  "XgemmDirect", Precision::kDouble, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDirectComplexDouble = {
+  "XgemmDirect", Precision::kComplexDouble, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index e5e8845e..be6606a6 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -57,6 +57,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",256}, {"WPT1",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",256}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1} } },
         { "Iris",                                            { {"WGS1",64}, {"WPT1",2} } },
         { "Iris Pro",                                        { {"WGS1",256}, {"WPT1",2} } },
@@ -77,7 +78,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",256}, {"WPT1",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",256}, {"WPT1",1} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
         { "GeForce GTX 980",                                 { {"WGS1",128}, {"WPT1",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WPT1",1} } },
@@ -120,6 +121,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",64}, {"WPT1",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",256}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1} } },
         { "Iris",                                            { {"WGS1",256}, {"WPT1",1} } },
         { "Iris Pro",                                        { {"WGS1",64}, {"WPT1",1} } },
@@ -140,7 +142,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",128}, {"WPT1",1} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
         { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
@@ -188,7 +190,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",128}, {"WPT1",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WPT1",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WPT1",1} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
         { "GeForce GTX 980",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",64}, {"WPT1",1} } },
diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp
index 52af628c..cd430dcb 100644
--- a/src/database/kernels/xgemv_fast.hpp
+++ b/src/database/kernels/xgemv_fast.hpp
@@ -57,10 +57,11 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "Iris",                                            { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
         { "Iris Pro",                                        { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
-        { "default",                                         { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
       }
     },
     { // Intel accelerators
@@ -77,7 +78,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
         { "GeForce GTX 670",                                 { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
         { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
         { "GeForce GTX 750",                                 { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
         { "GeForce GTX 980",                                 { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "GeForce GTX TITAN",                               { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "GeForce GTX TITAN X",                             { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@@ -120,6 +121,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "Iris",                                            { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "Iris Pro",                                        { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
@@ -139,7 +141,6 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
         { "GeForce GTX 480",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "GeForce GTX 670",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
       }
     },
@@ -186,7 +187,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
         { "GeForce GTX 670",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
         { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
         { "GeForce GTX 750",                                 { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
-        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
         { "GeForce GTX 980",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "GeForce GTX TITAN",                               { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "GeForce GTX TITAN X",                             { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp
index 328094e1..2dd7db32 100644
--- a/src/database/kernels/xgemv_fast_rot.hpp
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@@ -44,6 +44,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
       kDeviceTypeGPU, "Intel", {
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
         { "Iris Pro",                                        { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
         { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
@@ -51,8 +52,9 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
         { "GeForce GTX TITAN",                               { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
-        { "default",                                         { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
       }
     },
     { // Default
@@ -83,6 +85,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
       kDeviceTypeGPU, "Intel", {
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
         { "Iris Pro",                                        { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
         { "default",                                         { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } },
@@ -114,8 +117,9 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
         { "GeForce GTX TITAN",                               { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
-        { "default",                                         { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
       }
     },
     { // Default
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
index 3e9c25c1..7816080f 100644
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@@ -63,6 +63,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
         { "Iris Pro",                                        { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
         { "default",                                         { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
@@ -76,8 +77,9 @@ const Database::DatabaseEntry Database::XgerSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
       }
     },
     { // Default
@@ -120,6 +122,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
         { "Iris Pro",                                        { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
         { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
@@ -133,6 +136,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
         { "GeForce GTX 750",                                 { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
         { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
       }
@@ -180,8 +184,9 @@ const Database::DatabaseEntry Database::XgerDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
         { "GeForce GTX 750",                                 { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
       }
     },
     { // Default
@@ -227,6 +232,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
         { "default",                                         { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
       }
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 223501fd..b0817242 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -204,7 +204,7 @@ R"(
 #if PRECISION == 3232 || PRECISION == 6464
   #define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y
 #else
-  #define COMPLEX_CONJUGATE(value) value = value
+  #define COMPLEX_CONJUGATE(value) 
 #endif
 
 // =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl
new file mode 100644
index 00000000..a8bd450e
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part1.opencl
@@ -0,0 +1,273 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any
+// pre and and post-processing kernels.
+//
+// This kernel is seperated into three files. This is part 1 out of 3.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library. Note that all parameters here have a
+// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel.
+#ifndef WGD
+  #define WGD 8      // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64)
+#endif
+#ifndef MDIMCD
+  #define MDIMCD 8    // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef NDIMCD
+  #define NDIMCD 8    // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef MDIMAD
+  #define MDIMAD 8    // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
+#endif
+#ifndef NDIMBD
+  #define NDIMBD 8    // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
+#endif
+#ifndef KWID
+  #define KWID 1      // Unroll factor of the WGD loop (smaller or equal than WGD)
+#endif
+#ifndef VWMD
+  #define VWMD 1      // Vector width of matrices A and C
+#endif
+#ifndef VWND
+  #define VWND 1      // Vector width of matrix B
+#endif
+#ifndef PADA
+  #define PADA 1      // Local memory padding for matrix A
+#endif
+#ifndef PADB
+  #define PADB 1      // Local memory padding for matrix B
+#endif
+
+// Helper parameters based on the above tuning parameters
+#define MWID (WGD/MDIMCD)                // Work per work-item (M-dimension)
+#define NWID (WGD/NDIMCD)                // Work per work-item (N-dimension)
+#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
+#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
+#define MWAD (WGD/MDIMAD)                // Amount of loads-per-thread for matrix A (M-dimension)
+#define KWAD (WGD/KDIMAD)                // Amount of loads-per-thread for matrix A (K-dimension)
+#define KWBD (WGD/KDIMBD)                // Amount of loads-per-thread for matrix B (K-dimension)
+#define NWBD (WGD/NDIMBD)                // Amount of loads-per-thread for matrix B (N-dimension)
+
+// =================================================================================================
+
+// Data-widths in dimension M
+#if VWMD == 1
+    typedef real realMD;
+#elif VWMD == 2
+    typedef real2 realMD;
+#elif VWMD == 4
+    typedef real4 realMD;
+#elif VWMD == 8
+    typedef real8 realMD;
+#elif VWMD == 16
+    typedef real16 realMD;
+#endif
+
+// Data-widths in dimension N
+#if VWND == 1
+    typedef real realND;
+#elif VWND == 2
+    typedef real2 realND;
+#elif VWND == 4
+    typedef real4 realND;
+#elif VWND == 8
+    typedef real8 realND;
+#elif VWND == 16
+    typedef real16 realND;
+#endif
+
+// =================================================================================================
+
+// Initializes the accumulation registers to zero
+inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    #pragma unroll
+    for (int ni=0; ni<NWID; ++ni) {
+      SetToZero(cpm[ni][mi]);
+    }
+  }
+}
+
+// =================================================================================================
+
+// Performs the actual computation: Cpm += Apm * Bpm
+inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWID; ++mi) {
+      MultiplyAdd(cpm[ni][mi], apm[mi], bpm[ni]);
+    }
+  }
+}
+
+// =================================================================================================
+
+// Loads global off-chip memory into thread-private register files. This function is specific for
+// loading the A input matrix.
+inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
+                                   const int a_ld, const int a_offset, const int idm, const int idk,
+                                   const int a_transpose, const int a_conjugate) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
+    apm[mi] = agms[a_index + a_offset];
+    if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
+                                   const int b_ld, const int b_offset, const int idn, const int idk,
+                                   const int b_transpose, const int b_conjugate) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
+    bpm[ni] = bgms[b_index + b_offset];
+    if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
+  }
+}
+
+// Loads global off-chip memory into thread-private register files. This function is specific for
+// loading the A input matrix. This is the same as above but now includes a bounds check.
+inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
+                                    const int a_ld, const int a_offset, const int idm, const int idk,
+                                    const int a_transpose, const int a_conjugate,
+                                    const int kSizeM) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    if (idm + mi < kSizeM) {
+      const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
+      apm[mi] = agms[a_index + a_offset];
+      if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
+    }
+    else {
+      SetToZero(apm[mi]);
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
+                                    const int b_ld, const int b_offset, const int idn, const int idk,
+                                    const int b_transpose, const int b_conjugate,
+                                    const int kSizeN) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    if (idn + ni < kSizeN) {
+      const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
+      bpm[ni] = bgms[b_index + b_offset];
+      if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
+    }
+    else {
+      SetToZero(bpm[ni]);
+    }
+  }
+}
+
+// =================================================================================================
+
+// Caches on-chip local memory into per-thread private memory (registers). This function is specific
+// for caching the A input matrix.
+inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
+                                  const int a_transpose) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    const int mg = mi + get_local_id(0)*MWID;
+    const int index = (a_transpose) ? mg*(WGD + PADA) + kg : kg*(WGD + PADA) + mg;
+    apm[mi] = alm[index];
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
+                                  const int b_transpose) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    const int ng = ni + get_local_id(1)*NWID;
+    const int index = (b_transpose) ? ng*(WGD + PADB) + kg : kg*(WGD + PADB) + ng;
+    bpm[ni] = blm[index];
+  }
+}
+
+// =================================================================================================
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
+                               const int idm, const int idn,
+                               const real alpha, const real beta,
+                               const int c_ld, const int c_offset, const int c_transpose) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWID; ++mi) {
+
+      // Determines the destination index
+      int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi);
+
+      // The final multiplication with alpha (in case beta == 0)
+      real result;
+      if (IsZero(beta)) {
+        Multiply(result, alpha, cpm[ni][mi]);
+      }
+      // The final multiplication with alpha and the addition with beta*C
+      else {
+        AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]);
+      }
+      cgm[c_index + c_offset] = result;
+    }
+  }
+}
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
+                                const int idm, const int idn, const int kSizeM, const int kSizeN,
+                                const real alpha, const real beta,
+                                const int c_ld, const int c_offset, const int c_transpose) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWID; ++mi) {
+      if ((idm + mi) < kSizeM && (idn + ni) < kSizeN) {
+
+        // Determines the destination index
+        int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi);
+
+        // The final multiplication with alpha (in case beta == 0)
+        real result;
+        if (IsZero(beta)) {
+          Multiply(result, alpha, cpm[ni][mi]);
+        }
+        // The final multiplication with alpha and the addition with beta*C
+        else {
+          AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]);
+        }
+        cgm[c_index + c_offset] = result;
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
new file mode 100644
index 00000000..d77cbf65
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -0,0 +1,314 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix.
+inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
+                                 const int a_ld, const int a_offset, const int kwg,
+                                 const int a_transpose, const int a_conjugate) {
+  #if MDIMCD == MDIMAD
+    const int la0 = get_local_id(0);
+    const int la1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int la0 = tid % MDIMAD;
+    const int la1 = tid / MDIMAD;
+  #endif
+  #pragma unroll
+  for (int mia=0; mia<MWAD/VWMD; ++mia) {
+    #pragma unroll
+    for (int kia=0; kia<KWAD; ++kia) {
+
+      // Computes the indices for the global memory
+      int mg = mia + la0*(MWAD/VWMD);
+      int kg = kia + la1*KWAD;
+      int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD);
+      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
+      #if VWMD == 1
+         alm[kg*(WGD + PADA) + mg] = avec;
+      #elif VWMD == 2
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
+      #elif VWMD == 4
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
+         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.z;
+         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.w;
+      #elif VWMD == 8
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
+         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
+         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
+         alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
+         alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
+         alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
+         alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
+      #elif VWMD == 16
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
+         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
+         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
+         alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
+         alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
+         alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
+         alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
+         alm[kg*(WGD + PADA) + mg*VWMD + 8] = avec.s8;
+         alm[kg*(WGD + PADA) + mg*VWMD + 9] = avec.s9;
+         alm[kg*(WGD + PADA) + mg*VWMD + 10] = avec.sA;
+         alm[kg*(WGD + PADA) + mg*VWMD + 11] = avec.sB;
+         alm[kg*(WGD + PADA) + mg*VWMD + 12] = avec.sC;
+         alm[kg*(WGD + PADA) + mg*VWMD + 13] = avec.sD;
+         alm[kg*(WGD + PADA) + mg*VWMD + 14] = avec.sE;
+         alm[kg*(WGD + PADA) + mg*VWMD + 15] = avec.sF;
+      #endif
+      if (a_conjugate) {
+        for (int vm=0; vm<VWMD; ++vm) {
+          COMPLEX_CONJUGATE(alm[kg*(WGD + PADA) + mg*VWMD + vm]);
+        }
+      }
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
+                                 const int b_ld, const int b_offset, const int kwg,
+                                 const int b_transpose, const int b_conjugate) {
+  #if MDIMCD == NDIMBD
+    const int lb0 = get_local_id(0);
+    const int lb1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int lb0 = tid % NDIMBD;
+    const int lb1 = tid / NDIMBD;
+  #endif
+  #pragma unroll
+  for (int kib=0; kib<KWBD; ++kib) {
+    #pragma unroll
+    for (int nib=0; nib<NWBD/VWND; ++nib) {
+
+      // Computes the indices for the global memory
+      int ng = nib + lb0*(NWBD/VWND);
+      int kg = kib + lb1*KWBD;
+      int idn = (b_transpose) ? ng + kwg/VWND : ng + GetGroupID1()*(WGD/VWND);
+      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
+      #if VWND == 1
+         blm[kg*(WGD + PADB) + ng] = bvec;
+      #elif VWND == 2
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
+      #elif VWND == 4
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
+         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.z;
+         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.w;
+      #elif VWND == 8
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
+         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
+         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
+         blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
+         blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
+         blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
+         blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
+      #elif VWND == 16
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
+         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
+         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
+         blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
+         blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
+         blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
+         blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
+         blm[kg*(WGD + PADB) + ng*VWND + 8] = bvec.s8;
+         blm[kg*(WGD + PADB) + ng*VWND + 9] = bvec.s9;
+         blm[kg*(WGD + PADB) + ng*VWND + 10] = bvec.sA;
+         blm[kg*(WGD + PADB) + ng*VWND + 11] = bvec.sB;
+         blm[kg*(WGD + PADB) + ng*VWND + 12] = bvec.sC;
+         blm[kg*(WGD + PADB) + ng*VWND + 13] = bvec.sD;
+         blm[kg*(WGD + PADB) + ng*VWND + 14] = bvec.sE;
+         blm[kg*(WGD + PADB) + ng*VWND + 15] = bvec.sF;
+      #endif
+      if (b_conjugate) {
+        for (int vn=0; vn<VWND; ++vn) {
+          COMPLEX_CONJUGATE(blm[kg*(WGD + PADB) + ng*VWND + vn]);
+        }
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. In contrast to the functions above, this function performs doesn't
+// use the vector data-types.
+inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
+                                 const int a_ld, const int a_offset, const int kwg,
+                                 const int a_transpose, const int a_conjugate) {
+  #if MDIMCD == MDIMAD
+    const int la0 = get_local_id(0);
+    const int la1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int la0 = tid % MDIMAD;
+    const int la1 = tid / MDIMAD;
+  #endif
+  #pragma unroll
+  for (int mia=0; mia<MWAD; ++mia) {
+    #pragma unroll
+    for (int kia=0; kia<KWAD; ++kia) {
+
+      // Computes the indices for the global memory
+      int mg = mia + la0*MWAD;
+      int kg = kia + la1*KWAD;
+      int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
+      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      real result = agms[idk*a_ld + idm + a_offset];
+      if (a_conjugate) { COMPLEX_CONJUGATE(result); }
+      alm[kg*(WGD + PADA) + mg] = result;
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
+                                 const int b_ld, const int b_offset, const int kwg,
+                                 const int b_transpose, const int b_conjugate) {
+  #if MDIMCD == NDIMBD
+    const int lb0 = get_local_id(0);
+    const int lb1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int lb0 = tid % NDIMBD;
+    const int lb1 = tid / NDIMBD;
+  #endif
+  #pragma unroll
+  for (int kib=0; kib<KWBD; ++kib) {
+    #pragma unroll
+    for (int nib=0; nib<NWBD; ++nib) {
+
+      // Computes the indices for the global memory
+      int ng = nib + lb0*NWBD;
+      int kg = kib + lb1*KWBD;
+      int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
+      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      real result = bgms[idk*b_ld + idn + b_offset];
+      if (b_conjugate) { COMPLEX_CONJUGATE(result); }
+      blm[kg*(WGD + PADB) + ng] = result;
+    }
+  }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. In contrast to the functions above, this function performs bounds
+// checks and doesn't use the vector data-types.
+inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
+                                  const int a_ld, const int a_offset, const int kwg,
+                                  const int a_transpose, const int a_conjugate,
+                                  const int kSizeM, const int kSizeK) {
+  #if MDIMCD == MDIMAD
+    const int la0 = get_local_id(0);
+    const int la1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int la0 = tid % MDIMAD;
+    const int la1 = tid / MDIMAD;
+  #endif
+  #pragma unroll
+  for (int mia=0; mia<MWAD; ++mia) {
+    #pragma unroll
+    for (int kia=0; kia<KWAD; ++kia) {
+
+      // Computes the indices for the global memory
+      int mg = mia + la0*MWAD;
+      int kg = kia + la1*KWAD;
+      int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
+      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      int condition = (a_transpose) ? idm < kSizeK : idm < kSizeM;
+      if (condition) {
+        real result = agms[idk*a_ld + idm + a_offset];
+        if (a_conjugate) { COMPLEX_CONJUGATE(result); }
+        alm[kg*(WGD + PADA) + mg] = result;
+      }
+      else {
+        SetToZero(alm[kg*(WGD + PADA) + mg]);
+      }
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
+                                  const int b_ld, const int b_offset, const int kwg,
+                                  const int b_transpose, const int b_conjugate,
+                                  const int kSizeN, const int kSizeK) {
+  #if MDIMCD == NDIMBD
+    const int lb0 = get_local_id(0);
+    const int lb1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int lb0 = tid % NDIMBD;
+    const int lb1 = tid / NDIMBD;
+  #endif
+  #pragma unroll
+  for (int kib=0; kib<KWBD; ++kib) {
+    #pragma unroll
+    for (int nib=0; nib<NWBD; ++nib) {
+
+      // Computes the indices for the global memory
+      int ng = nib + lb0*NWBD;
+      int kg = kib + lb1*KWBD;
+      int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
+      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      int condition = (b_transpose) ? idn < kSizeK : idn < kSizeN;
+      if (condition) {
+        real result = bgms[idk*b_ld + idn + b_offset];
+        if (b_conjugate) { COMPLEX_CONJUGATE(result); }
+        blm[kg*(WGD + PADB) + ng] = result;
+      }
+      else {
+        SetToZero(blm[kg*(WGD + PADB) + ng]);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
new file mode 100644
index 00000000..a9350e00
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -0,0 +1,214 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Main body of the kernel. This is the direct version without pre/post processing and restrictions.
+inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
+                        const real_arg arg_alpha,
+                        const real_arg arg_beta,
+                        const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                        const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                        __global real* cgm, const int c_offset, const int c_ld,
+                        __local real* alm, __local real* blm,
+                        const int a_transpose, const int b_transpose, const int c_transpose,
+                        const int a_conjugate, const int b_conjugate) {
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
+
+  // Extra pointers to scalar versions of global memory
+  const __global real* restrict agms = (const __global real* restrict) agm;
+  const __global real* restrict bgms = (const __global real* restrict) bgm;
+
+  // Allocates workitem-private memory (registers)
+  real apm[MWID];
+  real bpm[NWID];
+  real cpm[NWID][MWID];
+
+  // Initializes the accumulation registers
+  InitAccRegistersDirect(cpm);
+
+  // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section
+  // processes only the main parts: output blocks of WGD by WGD.
+  const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD;
+  const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD;
+  if ((idm < (kSizeM/WGD)*WGD) && (idn < (kSizeN/WGD)*WGD)) {
+
+    // Loops over all complete workgroup tiles (K-dimension)
+    int kwg = 0;
+    for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
+
+      // Loads data: off-chip --> local (matrix A and B)
+      if (a_ld % VWMD == 0) {
+        GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
+      }
+      else {
+        GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
+      }
+      if (b_ld % VWND == 0) {
+        GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
+      }
+      else {
+        GlobalToLocalScalarB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      // Loops over all workitem tiles, unrolled by a factor KWID
+      for (int pwi=0; pwi<WGD; pwi+=KWID) {
+        #pragma unroll
+        for (int pit=0; pit<KWID; ++pit) {
+          int kg = pwi + pit;
+
+          // Loads data: local --> private (matrix A and B)
+          LocalToPrivateDirectA(alm, apm, kg, a_transpose);
+          LocalToPrivateDirectB(blm, bpm, kg, b_transpose);
+
+          // Performs the accumulation (Cpm += Apm * Bpm)
+          MultiplyAccumulateDirect(cpm, apm, bpm);
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Loop over the remaining part (incomplete tile in K-dimension)
+    for (; kwg < kSizeK; ++kwg) {
+
+      // Loads data: off-chip --> private (matrix A and B)
+      GlobalToPrivateDirectA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate);
+      GlobalToPrivateDirectB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate);
+
+      // Performs the accumulation (Cpm += Apm * Bpm)
+      MultiplyAccumulateDirect(cpm, apm, bpm);
+    }
+
+    // Stores a tile of results and performs the multiplication with alpha and beta
+    StoreResultsDirect(cgm, cpm, idm, idn, alpha, beta, c_ld, c_offset, c_transpose);
+  }
+
+  // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions)
+  else {
+
+    // Loops over all complete workgroup tiles (K-dimension)
+    int kwg = 0;
+    for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
+
+      // Loads data: off-chip --> local (matrix A and B)
+      GlobalToLocalCheckedA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate, kSizeM, kSizeK);
+      GlobalToLocalCheckedB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate, kSizeN, kSizeK);
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      // Loops over all workitem tiles, unrolled by a factor KWID
+      for (int pwi=0; pwi<WGD; pwi+=KWID) {
+        #pragma unroll
+        for (int pit=0; pit<KWID; ++pit) {
+          int kg = pwi + pit;
+
+          // Loads data: local --> private (matrix A and B)
+          LocalToPrivateDirectA(alm, apm, kg, a_transpose);
+          LocalToPrivateDirectB(blm, bpm, kg, b_transpose);
+
+          // Performs the accumulation (Cpm += Apm * Bpm)
+          MultiplyAccumulateDirect(cpm, apm, bpm);
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Loop over the remaining part (incomplete tile in K-dimension)
+    for (; kwg < kSizeK; ++kwg) {
+
+      // Loads data: off-chip --> private (matrix A and B)
+      GlobalToPrivateCheckedA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM);
+      GlobalToPrivateCheckedB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN);
+
+      // Performs the accumulation (Cpm += Apm * Bpm)
+      MultiplyAccumulateDirect(cpm, apm, bpm);
+    }
+
+    // Stores a tile of results and performs the multiplication with alpha and beta
+    StoreResultsChecked(cgm, cpm, idm, idn, kSizeM, kSizeN, alpha, beta, c_ld, c_offset, c_transpose);
+  }
+}
+
+// =================================================================================================
+
+// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [transposed, transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate);
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/msvc.hpp b/src/msvc.hpp
new file mode 100644
index 00000000..a45105df
--- /dev/null
+++ b/src/msvc.hpp
@@ -0,0 +1,39 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides macro's and definitions to make compilation work on Microsoft Visual Studio,
+// in particular for versions older than 2015 with limited C++11 support.
+// MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015)
+// MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013)
+// MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012)
+// MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010)
+// MSVC++ 9.0  _MSC_VER == 1500 (Visual Studio 2008)
+//
+// =================================================================================================
+
+#ifndef CLBLAST_MSVC_HPP_
+#define CLBLAST_MSVC_HPP_
+
+namespace clblast {
+// =================================================================================================
+#ifdef _MSC_VER
+
+// No support for constexpr prior to 2015. Note that this only works with constants, not with
+// constexpr functions (unused in this project).
+#if _MSC_VER < 1900
+#define constexpr const
+#endif
+
+// _MSC_VER
+#endif
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_MSVC_HPP_
+#endif
diff --git a/src/routine.cpp b/src/routine.cpp
index 189ae190..d938d66f 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -14,6 +14,7 @@
 #include <string>
 #include <vector>
 #include <chrono>
+#include <cstdlib>
 
 #include "routine.hpp"
 
@@ -42,13 +43,19 @@ StatusCode Routine::SetUp() {
   // Queries the cache to see whether or not the program (context-specific) is already there
   if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
 
+  // Sets the build options from an environmental variable (if set)
+  auto options = std::vector<std::string>();
+  const auto environment_variable = std::getenv("CLBLAST_BUILD_OPTIONS");
+  if (environment_variable != nullptr) {
+    options.push_back(std::string(environment_variable));
+  }
+
   // Queries the cache to see whether or not the binary (device-specific) is already there. If it
   // is, a program is created and stored in the cache
   if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
     try {
       auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
       auto program = Program(device_, context_, binary);
-      auto options = std::vector<std::string>();
       program.Build(device_, options);
       StoreProgramToCache(program, context_, precision_, routine_name_);
     } catch (...) { return StatusCode::kBuildProgramFailure; }
@@ -115,7 +122,6 @@ StatusCode Routine::SetUp() {
   // Compiles the kernel
   try {
     auto program = Program(context_, source_string);
-    auto options = std::vector<std::string>();
     const auto build_status = program.Build(device_, options);
 
     // Checks for compiler crashes/errors/warnings
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 0b8e768f..1602c69f 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -22,7 +22,9 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
+    Routine(queue, event, name,
+            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+            PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level3/level3.opencl"
     #include "../../kernels/level3/copy_fast.opencl"
@@ -32,10 +34,16 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/convert_symmetric.opencl"
     #include "../../kernels/level3/convert_triangular.opencl"
     #include "../../kernels/level3/convert_hermitian.opencl"
+    #include "../../kernels/level3/xgemm_direct_part1.opencl"
+    #include "../../kernels/level3/xgemm_direct_part2.opencl"
+    #include "../../kernels/level3/xgemm_direct_part3.opencl"
+  ;
+  auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
     #include "../../kernels/level3/xgemm_part3.opencl"
   ;
+  source_string_ += source_string_part_2;
 }
 
 // =================================================================================================
@@ -98,6 +106,44 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
   if (ErrorIn(status)) { return status; }
 
+  // Selects which version of GEMM to run
+  const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
+  if (do_gemm_direct) { // for small sizes (single kernel)
+    return GemmDirect(m, n, k, alpha,
+                      a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
+                      c_buffer, c_offset, c_ld,
+                      a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate);
+  }
+  else { // for larger sizes (pre/post-processing plus a very fast kernel)
+    return GemmIndirect(m, n, k, alpha,
+                        a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
+                        c_buffer, c_offset, c_ld,
+                        a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate,
+                        a_one, a_two, a_want_rotated,
+                        b_one, b_two, b_want_rotated,
+                        c_one, c_two, c_want_rotated);
+  }
+}
+
+// =================================================================================================
+
+// The indirect version of GEMM. This uses the faster but non-general kernel. It has specific
+// requirements, but several pre and post-processing kernels take care of those. However, the
+// overhead of these extra kernels might not be ideal for certain devices/arguments.
+template <typename T>
+StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
+                                  const T alpha,
+                                  const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                                  const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                                  const T beta,
+                                  const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                                  const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                                  const bool a_conjugate, const bool b_conjugate,
+                                  const size_t a_one, const size_t a_two, const bool a_want_rotated,
+                                  const size_t b_one, const size_t b_two, const bool b_want_rotated,
+                                  const size_t c_one, const size_t c_two, const bool c_want_rotated) {
+  auto status = StatusCode::kSuccess;
+
   // Calculates the ceiled versions of m, n, and k
   const auto m_ceiled = Ceil(m, db_["MWG"]);
   const auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -217,6 +263,66 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   } catch (...) { return StatusCode::kTempBufferAllocFailure; }
 }
 
+
+// =================================================================================================
+
+// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
+template <typename T>
+StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
+                                const T alpha,
+                                const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                                const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                                const T beta,
+                                const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                                const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                                const bool a_conjugate, const bool b_conjugate) {
+
+  // Loads the program from the database
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+  // Retrieves the proper XgemmDirect kernel from the compiled binary
+  try {
+    const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
+                                         (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
+    auto kernel = Kernel(program, name);
+
+    // Sets the kernel arguments
+    kernel.SetArgument(0, static_cast<int>(m));
+    kernel.SetArgument(1, static_cast<int>(n));
+    kernel.SetArgument(2, static_cast<int>(k));
+    kernel.SetArgument(3, GetRealArg(alpha));
+    kernel.SetArgument(4, GetRealArg(beta));
+    kernel.SetArgument(5, a_buffer());
+    kernel.SetArgument(6, static_cast<int>(a_offset));
+    kernel.SetArgument(7, static_cast<int>(a_ld));
+    kernel.SetArgument(8, b_buffer());
+    kernel.SetArgument(9, static_cast<int>(b_offset));
+    kernel.SetArgument(10, static_cast<int>(b_ld));
+    kernel.SetArgument(11, c_buffer());
+    kernel.SetArgument(12, static_cast<int>(c_offset));
+    kernel.SetArgument(13, static_cast<int>(c_ld));
+    kernel.SetArgument(14, static_cast<int>(c_do_transpose));
+    kernel.SetArgument(15, static_cast<int>(a_conjugate));
+    kernel.SetArgument(16, static_cast<int>(b_conjugate));
+
+    // Computes the global and local thread sizes
+    const auto m_ceiled = Ceil(m, db_["WGD"]);
+    const auto n_ceiled = Ceil(n, db_["WGD"]);
+    const auto global = std::vector<size_t>{
+      (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
+      (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
+    };
+    const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
+
+    // Launches the kernel
+    auto status = RunKernel(kernel, queue_, device_, global, local, event_);
+    if (ErrorIn(status)) { return status; }
+
+    // Successfully finished the computation
+    return StatusCode::kSuccess;
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
 // =================================================================================================
 
 // Compiles the templated class
diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp
index bc51c7f5..46e12453 100644
--- a/src/routines/level3/xgemm.hpp
+++ b/src/routines/level3/xgemm.hpp
@@ -35,6 +35,29 @@ class Xgemm: public Routine {
                     const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                     const T beta,
                     const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+
+  // Indirect version of GEMM (with pre and post-processing kernels)
+  StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k,
+                          const T alpha,
+                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                          const T beta,
+                          const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                          const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                          const bool a_conjugate, const bool b_conjugate,
+                          const size_t a_one, const size_t a_two, const bool a_want_rotated,
+                          const size_t b_one, const size_t b_two, const bool b_want_rotated,
+                          const size_t c_one, const size_t c_two, const bool c_want_rotated);
+
+  // Direct version of GEMM (no pre and post-processing kernels)
+  StatusCode GemmDirect(const size_t m, const size_t n, const size_t k,
+                        const T alpha,
+                        const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                        const T beta,
+                        const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                        const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                        const bool a_conjugate, const bool b_conjugate);
 };
 
 // =================================================================================================
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index 78ded56e..c57aab39 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -47,6 +47,7 @@ class TuneCopy {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 90f5ea82..9486ee8d 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -47,6 +47,7 @@ class TunePad {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 10fa80cb..2d9d5e49 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -47,6 +47,7 @@ class TuneTranspose {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index 507718eb..d364dabe 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -47,6 +47,7 @@ class TunePadTranspose {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 0033b3c6..403ee9e4 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -51,6 +51,7 @@ class TuneXaxpy {
   static size_t DefaultN() { return 4096*1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp
index 1581e13f..f8416761 100644
--- a/src/tuning/kernels/xdot.cpp
+++ b/src/tuning/kernels/xdot.cpp
@@ -47,6 +47,7 @@ class TuneXdot {
   static size_t DefaultN() { return 2*1024*1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 4cb7fd00..0eb1875b 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -52,6 +52,7 @@ class TuneXgemm {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1024; }
   static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
@@ -126,10 +127,10 @@ class TuneXgemm {
   // Sets the local memory size
   static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
     auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision));
+      return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision));
     };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM",
-                                                    "SB", "KWG", "NWG", "VWN"});
+    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG",
+                                                    "SB", "KWG", "NWG"});
   }
 
   // Sets the base thread configuration
diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp
new file mode 100644
index 00000000..204e0be4
--- /dev/null
+++ b/src/tuning/kernels/xgemm_direct.cpp
@@ -0,0 +1,196 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations:
+// - V==1: This tests some limited set of tuning parameters exhaustively.
+// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, int V>
+class TuneXgemmDirect {
+ public:
+
+  // The representative kernel and the source code
+  static std::string KernelFamily() { return (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; }
+  static std::string KernelName() { return "XgemmDirectTN"; }
+  static std::string GetSources() {
+    return
+      #include "../src/kernels/common.opencl"
+      #include "../src/kernels/level3/xgemm_direct_part1.opencl"
+      #include "../src/kernels/level3/xgemm_direct_part2.opencl"
+      #include "../src/kernels/level3/xgemm_direct_part3.opencl"
+    ;
+  }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction};
+  }
+
+  // Tests for valid arguments
+  static void TestValidArguments(const Arguments<T> &) { }
+
+  // Sets the default values for the arguments
+  static size_t DefaultM() { return 256; }
+  static size_t DefaultN() { return 256; }
+  static size_t DefaultK() { return 256; }
+  static double DefaultFraction() { return (V==1) ? 1.0 : 32.0; } // test all or sample randomly
+  static size_t DefaultNumRuns() { return 4; } // run every kernel this many times for averaging
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.k; }
+  static size_t GetSizeB(const Arguments<T> &args) { return args.n * args.k; }
+  static size_t GetSizeC(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
+
+  // Sets the tuning parameters and their possible values
+  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
+    if (V==1) { // limited subset of tuning parameters - but explorable exhaustively
+      tuner.AddParameter(id, "WGD", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMAD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMBD", {8, 16, 32});
+      tuner.AddParameter(id, "KWID", {2});
+      tuner.AddParameter(id, "VWMD", {1, 2, 4, 8});
+      tuner.AddParameter(id, "VWND", {1, 2, 4, 8});
+      tuner.AddParameter(id, "PADA", {1});
+      tuner.AddParameter(id, "PADB", {1});
+    } // a lot more tuning parameters - has to be sampled randomly, too much to test all
+    else {
+      tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128});
+      tuner.AddParameter(id, "MDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMAD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMBD", {8, 16, 32});
+      tuner.AddParameter(id, "KWID", {2, 8, 16});
+      tuner.AddParameter(id, "VWMD", {1, 2, 4, 8});
+      tuner.AddParameter(id, "VWND", {1, 2, 4, 8});
+      tuner.AddParameter(id, "PADA", {0, 1});
+      tuner.AddParameter(id, "PADB", {0, 1});
+    }
+  }
+
+  // Sets the constraints
+  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+    auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
+    auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
+    auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
+    // Requirement for unrolling the WGD loop
+    tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"});
+    // Required for integer MWID and NWID
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"});
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"});
+    // Required for integer MWIAD and NWIBD
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"});
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"});
+    // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...)
+    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"});
+    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"});
+
+    // Extra constraints for variation 1 to limit the set of options significantly
+    if (V==1) {
+      auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
+      tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"});
+      tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"});
+    }
+  }
+
+  // Sets the local memory size
+  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
+    auto LocalMemorySize = [args] (std::vector<size_t> v) {
+      return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision));
+    };
+    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"});
+  }
+
+  // Sets the base thread configuration
+  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
+  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
+  static std::vector<size_t> LocalSize() { return {1, 1}; }
+  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
+
+  // Transforms the thread configuration based on the parameters
+  using TransformVector = std::vector<std::vector<std::string>>;
+  static TransformVector MulLocal() { return {{"MDIMCD", "NDIMCD"}}; }
+  static TransformVector DivLocal() { return {}; }
+  static TransformVector MulGlobal() { return {{"MDIMCD", "NDIMCD"}}; }
+  static TransformVector DivGlobal() { return {{"WGD", "WGD"}}; }
+
+  // Sets the kernel's arguments
+  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
+                           std::vector<T> &, std::vector<T> &,
+                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
+                           std::vector<T> &) {
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(static_cast<int>(args.n));
+    tuner.AddArgumentScalar(static_cast<int>(args.k));
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+    tuner.AddArgumentScalar(GetRealArg(args.beta));
+    tuner.AddArgumentInput(a_mat);
+    tuner.AddArgumentScalar(0); // a_offset
+    tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld
+    tuner.AddArgumentInput(b_mat);
+    tuner.AddArgumentScalar(0); // b_offset
+    tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld
+    tuner.AddArgumentOutput(c_mat);
+    tuner.AddArgumentScalar(0); // c_offset
+    tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld
+    tuner.AddArgumentScalar(1); // c_do_transpose
+    tuner.AddArgumentScalar(0); // a_conjugate
+    tuner.AddArgumentScalar(0); // b_conjugate
+  }
+
+  // Describes how to compute the performance metrics
+  static size_t GetMetric(const Arguments<T> &args) {
+    return 2 * args.m * args.n * args.k;
+  }
+  static std::string PerformanceUnit() { return "GFLOPS"; }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Function to tune a specific variation V (not within the clblast namespace)
+template <int V>
+void StartVariation(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemmDirect<half,V>, half>(argc, argv); break;
+    case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemmDirect<float,V>, float>(argc, argv); break;
+    case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemmDirect<double,V>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemmDirect<float2,V>, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemmDirect<double2,V>, double2>(argc, argv); break;
+  }
+}
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  StartVariation<1>(argc, argv);
+  StartVariation<2>(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index 7229602d..f332f52a 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -50,6 +50,7 @@ class TuneXgemv {
   static size_t DefaultN() { return 2048; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index 1fb5c531..c3d0c7dd 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -47,6 +47,7 @@ class TuneXger {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.m; }
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 19df5f9a..afb092bc 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -30,6 +30,7 @@ namespace clblast {
 // that it is automatically compiled for the various kernels (given as the 'C' template argument).
 template <typename C, typename T>
 void Tuner(int argc, char* argv[]) {
+  constexpr auto kSeed = 42; // fixed seed for reproducibility
 
   // Sets the parameters and platform/device for which to tune (command-line options)
   auto help = std::string{"* Options given/available:\n"};
@@ -45,6 +46,8 @@ void Tuner(int argc, char* argv[]) {
     if (o == kArgBeta)     { args.beta     = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
     if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
   }
+  const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, C::DefaultNumRuns());
+
   fprintf(stdout, "%s\n", help.c_str());
 
   // Tests validity of the given arguments
@@ -73,12 +76,12 @@ void Tuner(int argc, char* argv[]) {
   auto b_mat = std::vector<T>(C::GetSizeB(args));
   auto c_mat = std::vector<T>(C::GetSizeC(args));
   auto temp = std::vector<T>(C::GetSizeTemp(args));
-  PopulateVector(x_vec);
-  PopulateVector(y_vec);
-  PopulateVector(a_mat);
-  PopulateVector(b_mat);
-  PopulateVector(c_mat);
-  PopulateVector(temp);
+  PopulateVector(x_vec, kSeed);
+  PopulateVector(y_vec, kSeed);
+  PopulateVector(a_mat, kSeed);
+  PopulateVector(b_mat, kSeed);
+  PopulateVector(c_mat, kSeed);
+  PopulateVector(temp, kSeed);
 
   // Initializes the tuner for the chosen device
   cltune::Tuner tuner(args.platform_id, args.device_id);
@@ -126,6 +129,7 @@ void Tuner(int argc, char* argv[]) {
   C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
 
   // Starts the tuning process
+  tuner.SetNumRuns(num_runs);
   tuner.Tune();
 
   // Prints the results to screen
@@ -134,7 +138,7 @@ void Tuner(int argc, char* argv[]) {
 
   // Also prints the performance of the best-case in terms of GB/s or GFLOPS
   if (time_ms != 0.0) {
-    printf("[ -------> ] %.1lf ms", time_ms);
+    printf("[ -------> ] %.2lf ms", time_ms);
     printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
   }
 
diff --git a/src/utilities.cpp b/src/utilities.cpp
index 77bc72d7..86cc2d13 100644
--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -270,40 +270,40 @@ unsigned int GetRandomSeed() {
 
 // Create a random number generator and populates a vector with samples from a random distribution
 template <typename T>
-void PopulateVector(std::vector<T> &vector) {
+void PopulateVector(std::vector<T> &vector, const unsigned int seed) {
   auto lower_limit = static_cast<T>(kTestDataLowerLimit);
   auto upper_limit = static_cast<T>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<T> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element = dist(mt); }
 }
-template void PopulateVector<float>(std::vector<float>&);
-template void PopulateVector<double>(std::vector<double>&);
+template void PopulateVector<float>(std::vector<float>&, const unsigned int);
+template void PopulateVector<double>(std::vector<double>&, const unsigned int);
 
 // Specialized versions of the above for complex data-types
 template <>
-void PopulateVector(std::vector<float2> &vector) {
+void PopulateVector(std::vector<float2> &vector, const unsigned int seed) {
   auto lower_limit = static_cast<float>(kTestDataLowerLimit);
   auto upper_limit = static_cast<float>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
 }
 template <>
-void PopulateVector(std::vector<double2> &vector) {
+void PopulateVector(std::vector<double2> &vector, const unsigned int seed) {
   auto lower_limit = static_cast<double>(kTestDataLowerLimit);
   auto upper_limit = static_cast<double>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<double> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
 }
 
 // Specialized versions of the above for half-precision
 template <>
-void PopulateVector(std::vector<half> &vector) {
+void PopulateVector(std::vector<half> &vector, const unsigned int seed) {
   const auto lower_limit = static_cast<float>(kTestDataLowerLimit);
   const auto upper_limit = static_cast<float>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element = FloatToHalf(dist(mt)); }
 }
diff --git a/src/utilities.hpp b/src/utilities.hpp
index 75bd5a69..038a8a96 100644
--- a/src/utilities.hpp
+++ b/src/utilities.hpp
@@ -25,6 +25,8 @@
 #include "clblast_half.h"
 #include "clpp11.hpp"
 
+#include "msvc.hpp"
+
 namespace clblast {
 // =================================================================================================
 
@@ -206,7 +208,7 @@ bool CheckArgument(const int argc, char *argv[], std::string &help, const std::s
 // =================================================================================================
 
 // Helper function to check for errors in the status code
-constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
+inline bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
 
 // =================================================================================================
 
@@ -219,7 +221,7 @@ constexpr auto kTestDataUpperLimit = 2.0;
 
 // Populates a vector with random data
 template <typename T>
-void PopulateVector(std::vector<T> &vector);
+void PopulateVector(std::vector<T> &vector, const unsigned int seed);
 
 // =================================================================================================
 
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 2e751255..fc908b9e 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -66,13 +66,13 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
   c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
   ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
   scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset);
-  PopulateVector(x_source_);
-  PopulateVector(y_source_);
-  PopulateVector(a_source_);
-  PopulateVector(b_source_);
-  PopulateVector(c_source_);
-  PopulateVector(ap_source_);
-  PopulateVector(scalar_source_);
+  PopulateVector(x_source_, kSeed);
+  PopulateVector(y_source_, kSeed);
+  PopulateVector(a_source_, kSeed);
+  PopulateVector(b_source_, kSeed);
+  PopulateVector(c_source_, kSeed);
+  PopulateVector(ap_source_, kSeed);
+  PopulateVector(scalar_source_, kSeed);
 }
 
 // ===============================================================================================
diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp
index d01cd06c..4b773801 100644
--- a/test/correctness/testblas.hpp
+++ b/test/correctness/testblas.hpp
@@ -30,6 +30,7 @@ namespace clblast {
 template <typename T, typename U>
 class TestBlas: public Tester<T,U> {
  public:
+  static constexpr auto kSeed = 42; // fixed seed for reproducibility
 
   // Uses several variables from the Tester class
   using Tester<T,U>::context_;
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index 362c5c2c..41e457b6 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -358,28 +358,33 @@ void Tester<T,U>::PrintErrorLog(const std::vector<ErrorLogEntry> &error_log) {
 // Compares two floating point values and returns whether they are within an acceptable error
 // margin. This replaces GTest's EXPECT_NEAR().
 template <typename T>
-bool TestSimilarity(const T val1, const T val2) {
+bool TestSimilarityNear(const T val1, const T val2,
+                        const T error_margin_absolute, const T error_margin_relative) {
   const auto difference = std::fabs(val1 - val2);
 
-  // Set the allowed error margin for floating-point comparisons
-  constexpr auto kErrorMarginRelative = T(0.025);
-  constexpr auto kErrorMarginAbsolute = T(1.0e-3);
-
   // Shortcut, handles infinities
   if (val1 == val2) {
     return true;
   }
   // The values are zero or very small: the relative error is less meaningful
-  else if (val1 == 0 || val2 == 0 || difference < kErrorMarginAbsolute) {
-    return (difference < kErrorMarginAbsolute);
+  else if (val1 == 0 || val2 == 0 || difference < error_margin_absolute) {
+    return (difference < error_margin_absolute);
   }
   // Use relative error
   else {
     const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
-    return (difference / absolute_sum) < kErrorMarginRelative;
+    return (difference / absolute_sum) < error_margin_relative;
   }
 }
 
+// Default method for similarity testing
+template <typename T>
+bool TestSimilarity(const T val1, const T val2) {
+  constexpr auto kErrorMarginRelative = T(0.025);
+  constexpr auto kErrorMarginAbsolute = T(0.001);
+  return TestSimilarityNear(val1, val2, kErrorMarginRelative, kErrorMarginAbsolute);
+}
+
 // Compiles the default case for standard data-types
 template bool TestSimilarity<float>(const float, const float);
 template bool TestSimilarity<double>(const double, const double);
@@ -399,7 +404,10 @@ bool TestSimilarity(const double2 val1, const double2 val2) {
 }
 template <>
 bool TestSimilarity(const half val1, const half val2) {
-  return TestSimilarity(HalfToFloat(val1), HalfToFloat(val2));
+  constexpr auto kErrorMarginRelative = float(0.050);
+  constexpr auto kErrorMarginAbsolute = float(0.002);
+  return TestSimilarityNear(HalfToFloat(val1), HalfToFloat(val2),
+                            kErrorMarginRelative, kErrorMarginAbsolute);
 }
 
 // =================================================================================================
diff --git a/test/performance/client.cpp b/test/performance/client.cpp
index aaaab22e..cbb10d10 100644
--- a/test/performance/client.cpp
+++ b/test/performance/client.cpp
@@ -178,13 +178,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
     std::vector<T> c_source(args.c_size);
     std::vector<T> ap_source(args.ap_size);
     std::vector<T> scalar_source(args.scalar_size);
-    PopulateVector(x_source);
-    PopulateVector(y_source);
-    PopulateVector(a_source);
-    PopulateVector(b_source);
-    PopulateVector(c_source);
-    PopulateVector(ap_source);
-    PopulateVector(scalar_source);
+    PopulateVector(x_source, kSeed);
+    PopulateVector(y_source, kSeed);
+    PopulateVector(a_source, kSeed);
+    PopulateVector(b_source, kSeed);
+    PopulateVector(c_source, kSeed);
+    PopulateVector(ap_source, kSeed);
+    PopulateVector(scalar_source, kSeed);
 
     // Creates the matrices on the device
     auto x_vec = Buffer<T>(context, args.x_size);
diff --git a/test/performance/client.hpp b/test/performance/client.hpp
index 6d35fced..381ba158 100644
--- a/test/performance/client.hpp
+++ b/test/performance/client.hpp
@@ -40,6 +40,7 @@ namespace clblast {
 template <typename T, typename U>
 class Client {
  public:
+  static constexpr auto kSeed = 42; // fixed seed for reproducibility
 
   // Shorthand for the routine-specific functions passed to the tester
   using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp
index 4423845e..f98bdb06 100644
--- a/test/routines/level1/xamax.hpp
+++ b/test/routines/level1/xamax.hpp
@@ -76,7 +76,7 @@ class TestXamax {
                           buffers.scalar(), args.imax_offset,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp
index b1f02dcd..64aa37c2 100644
--- a/test/routines/level1/xasum.hpp
+++ b/test/routines/level1/xasum.hpp
@@ -76,7 +76,7 @@ class TestXasum {
                           buffers.scalar(), args.asum_offset,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp
index c276a42e..b24e6fe8 100644
--- a/test/routines/level1/xaxpy.hpp
+++ b/test/routines/level1/xaxpy.hpp
@@ -77,7 +77,7 @@ class TestXaxpy {
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp
index a96bb9ae..87bc21d4 100644
--- a/test/routines/level1/xcopy.hpp
+++ b/test/routines/level1/xcopy.hpp
@@ -76,7 +76,7 @@ class TestXcopy {
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp
index f6cf2809..c4f6076a 100644
--- a/test/routines/level1/xdot.hpp
+++ b/test/routines/level1/xdot.hpp
@@ -81,7 +81,7 @@ class TestXdot {
                          buffers.x_vec(), args.x_offset, args.x_inc,
                          buffers.y_vec(), args.y_offset, args.y_inc,
                          &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp
index 2b00d04b..aae892a8 100644
--- a/test/routines/level1/xdotc.hpp
+++ b/test/routines/level1/xdotc.hpp
@@ -81,7 +81,7 @@ class TestXdotc {
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp
index 31a867e0..f6be385b 100644
--- a/test/routines/level1/xdotu.hpp
+++ b/test/routines/level1/xdotu.hpp
@@ -81,7 +81,7 @@ class TestXdotu {
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp
index 62d649e3..e604077c 100644
--- a/test/routines/level1/xnrm2.hpp
+++ b/test/routines/level1/xnrm2.hpp
@@ -76,7 +76,7 @@ class TestXnrm2 {
                           buffers.scalar(), args.nrm2_offset,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp
index 79926890..3c438bd6 100644
--- a/test/routines/level1/xscal.hpp
+++ b/test/routines/level1/xscal.hpp
@@ -72,7 +72,7 @@ class TestXscal {
     auto status = Scal(args.n, args.alpha,
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp
index 8f7e4cfe..a0491f12 100644
--- a/test/routines/level1/xswap.hpp
+++ b/test/routines/level1/xswap.hpp
@@ -76,7 +76,7 @@ class TestXswap {
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp
index 5a907077..5ed92aae 100644
--- a/test/routines/level2/xgbmv.hpp
+++ b/test/routines/level2/xgbmv.hpp
@@ -90,7 +90,7 @@ class TestXgbmv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp
index 1499b2d2..9ee6d535 100644
--- a/test/routines/level2/xgemv.hpp
+++ b/test/routines/level2/xgemv.hpp
@@ -90,7 +90,7 @@ class TestXgemv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp
index 5cbed505..42283107 100644
--- a/test/routines/level2/xger.hpp
+++ b/test/routines/level2/xger.hpp
@@ -86,7 +86,7 @@ class TestXger {
                       buffers.y_vec(), args.y_offset, args.y_inc,
                       buffers.a_mat(), args.a_offset, args.a_ld,
                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp
index d50092cb..ef69c197 100644
--- a/test/routines/level2/xgerc.hpp
+++ b/test/routines/level2/xgerc.hpp
@@ -86,7 +86,7 @@ class TestXgerc {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp
index 9c823b73..b2afc6d8 100644
--- a/test/routines/level2/xgeru.hpp
+++ b/test/routines/level2/xgeru.hpp
@@ -86,7 +86,7 @@ class TestXgeru {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp
index 01cb3f51..8bda4d0c 100644
--- a/test/routines/level2/xhbmv.hpp
+++ b/test/routines/level2/xhbmv.hpp
@@ -84,7 +84,7 @@ class TestXhbmv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp
index dadd3975..80565d04 100644
--- a/test/routines/level2/xhemv.hpp
+++ b/test/routines/level2/xhemv.hpp
@@ -84,7 +84,7 @@ class TestXhemv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp
index b21c0a9b..d71c8009 100644
--- a/test/routines/level2/xher.hpp
+++ b/test/routines/level2/xher.hpp
@@ -79,7 +79,7 @@ class TestXher {
                       buffers.x_vec(), args.x_offset, args.x_inc,
                       buffers.a_mat(), args.a_offset, args.a_ld,
                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp
index 070f823c..083dfa2f 100644
--- a/test/routines/level2/xher2.hpp
+++ b/test/routines/level2/xher2.hpp
@@ -84,7 +84,7 @@ class TestXher2 {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp
index d7f9634e..1dd63562 100644
--- a/test/routines/level2/xhpmv.hpp
+++ b/test/routines/level2/xhpmv.hpp
@@ -84,7 +84,7 @@ class TestXhpmv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp
index 8f44a68d..a5c77811 100644
--- a/test/routines/level2/xhpr.hpp
+++ b/test/routines/level2/xhpr.hpp
@@ -79,7 +79,7 @@ class TestXhpr {
                       buffers.x_vec(), args.x_offset, args.x_inc,
                       buffers.ap_mat(), args.ap_offset,
                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp
index 666a8dfc..d09178f0 100644
--- a/test/routines/level2/xhpr2.hpp
+++ b/test/routines/level2/xhpr2.hpp
@@ -84,7 +84,7 @@ class TestXhpr2 {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.ap_mat(), args.ap_offset,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp
index fd5dd68e..8e0f8321 100644
--- a/test/routines/level2/xsbmv.hpp
+++ b/test/routines/level2/xsbmv.hpp
@@ -84,7 +84,7 @@ class TestXsbmv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp
index 63286248..977f733a 100644
--- a/test/routines/level2/xspmv.hpp
+++ b/test/routines/level2/xspmv.hpp
@@ -84,7 +84,7 @@ class TestXspmv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp
index f9dead53..93da4b73 100644
--- a/test/routines/level2/xspr.hpp
+++ b/test/routines/level2/xspr.hpp
@@ -79,7 +79,7 @@ class TestXspr {
                       buffers.x_vec(), args.x_offset, args.x_inc,
                       buffers.ap_mat(), args.ap_offset,
                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp
index a2f22098..b835f2b0 100644
--- a/test/routines/level2/xspr2.hpp
+++ b/test/routines/level2/xspr2.hpp
@@ -84,7 +84,7 @@ class TestXspr2 {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.ap_mat(), args.ap_offset,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp
index 0d3ca632..0ec96f1d 100644
--- a/test/routines/level2/xsymv.hpp
+++ b/test/routines/level2/xsymv.hpp
@@ -84,7 +84,7 @@ class TestXsymv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp
index 15ad9595..b49132e3 100644
--- a/test/routines/level2/xsyr.hpp
+++ b/test/routines/level2/xsyr.hpp
@@ -79,7 +79,7 @@ class TestXsyr {
                       buffers.x_vec(), args.x_offset, args.x_inc,
                       buffers.a_mat(), args.a_offset, args.a_ld,
                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp
index a9a61a1f..7c65daa2 100644
--- a/test/routines/level2/xsyr2.hpp
+++ b/test/routines/level2/xsyr2.hpp
@@ -84,7 +84,7 @@ class TestXsyr2 {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp
index 54e7fe18..cf30c2f7 100644
--- a/test/routines/level2/xtbmv.hpp
+++ b/test/routines/level2/xtbmv.hpp
@@ -78,7 +78,7 @@ class TestXtbmv {
                           buffers.a_mat(), args.a_offset, args.a_ld,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp
index 9776c4de..d08e132f 100644
--- a/test/routines/level2/xtpmv.hpp
+++ b/test/routines/level2/xtpmv.hpp
@@ -78,7 +78,7 @@ class TestXtpmv {
                           buffers.ap_mat(), args.ap_offset,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp
index 18300e50..cf9a0063 100644
--- a/test/routines/level2/xtrmv.hpp
+++ b/test/routines/level2/xtrmv.hpp
@@ -78,7 +78,7 @@ class TestXtrmv {
                           buffers.a_mat(), args.a_offset, args.a_ld,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp
index 5f9bea81..bca3c049 100644
--- a/test/routines/level3/xgemm.hpp
+++ b/test/routines/level3/xgemm.hpp
@@ -92,7 +92,7 @@ class TestXgemm {
                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                        buffers.c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp
index 8c44be25..31c7695f 100644
--- a/test/routines/level3/xhemm.hpp
+++ b/test/routines/level3/xhemm.hpp
@@ -92,7 +92,7 @@ class TestXhemm {
                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                        buffers.c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp
index fd20bbb5..ff2bb6cb 100644
--- a/test/routines/level3/xher2k.hpp
+++ b/test/routines/level3/xher2k.hpp
@@ -91,7 +91,7 @@ class TestXher2k {
                         buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                         buffers.c_mat(), args.c_offset, args.c_ld,
                         &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp
index 12990d39..26396fa9 100644
--- a/test/routines/level3/xherk.hpp
+++ b/test/routines/level3/xherk.hpp
@@ -82,7 +82,7 @@ class TestXherk {
                        buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
                        buffers.c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp
index f8e90927..c84c22b4 100644
--- a/test/routines/level3/xsymm.hpp
+++ b/test/routines/level3/xsymm.hpp
@@ -92,7 +92,7 @@ class TestXsymm {
                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                        buffers.c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp
index 4e4ba0b7..5c4976e2 100644
--- a/test/routines/level3/xsyr2k.hpp
+++ b/test/routines/level3/xsyr2k.hpp
@@ -90,7 +90,7 @@ class TestXsyr2k {
                         buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                         buffers.c_mat(), args.c_offset, args.c_ld,
                         &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp
index f5509c88..98c4f6a4 100644
--- a/test/routines/level3/xsyrk.hpp
+++ b/test/routines/level3/xsyrk.hpp
@@ -82,7 +82,7 @@ class TestXsyrk {
                        buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
                        buffers.c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp
index 45e17e45..55b51e54 100644
--- a/test/routines/level3/xtrmm.hpp
+++ b/test/routines/level3/xtrmm.hpp
@@ -82,7 +82,7 @@ class TestXtrmm {
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        buffers.b_mat(), args.b_offset, args.b_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp
index 4637c07e..dccb3583 100644
--- a/test/routines/levelx/xomatcopy.hpp
+++ b/test/routines/levelx/xomatcopy.hpp
@@ -77,7 +77,7 @@ class TestXomatcopy {
                               buffers.a_mat(), args.a_offset, args.a_ld,
                               buffers.b_mat(), args.b_offset, args.b_ld,
                               &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }