118 files changed, 2290 insertions, 383 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 1995dc84..2affaadd 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,15 @@
 
+Development version (next release)
+- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
+- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
+- Fixed a bug in the tests and samples related to waiting for an invalid event
+- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
+- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
+- Added an option to run tuned kernels multiple times to average execution times
+- Added an option to build a static version of the library
+- Various minor fixes and enhancements
+- Added tuned parameters for various devices (see README)
+
 Version 0.9.0
 - Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header
 - Improved performance significantly of rotated GEMV computations
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9f32a811..ae78b5a7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,7 @@ set(clblast_VERSION_MINOR 9)
 set(clblast_VERSION_PATCH 0)
 
 # Options and their default values
+option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON)
 option(SAMPLES "Enable compilation of the examples" OFF)
 option(TUNERS "Enable compilation of the tuners" OFF)
 option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
@@ -64,12 +65,26 @@ elseif(MSVC)
   endif()
 endif()
 
+# DLL Settings
+if(MSVC)
+  if(BUILD_SHARED_LIBS)
+    add_definitions(" /DCLBLAST_DLL")
+  else(BUILD_SHARED_LIBS)
+    add_definitions(" /DCLBLAST_STATIC")
+  endif(BUILD_SHARED_LIBS)
+endif(MSVC)
+
 # C++ compiler settings
 if(MSVC)
   set(FLAGS "/Ox")
   set(FLAGS "${FLAGS} /wd4715")
 else()
-  set(FLAGS "-O3 -std=c++11")
+  set(FLAGS "-std=c++11")
+  if(VERBOSE)
+    set(FLAGS "${FLAGS} -O1 -g")
+  else()
+    set(FLAGS "${FLAGS} -O3")
+  endif()
   if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
     set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
     if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
@@ -134,7 +149,8 @@ endif()
 # ==================================================================================================
 
 # Sets the supported routines and the used kernels. New routines and kernels should be added here.
-set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv)
+set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
+            xgemm xgemm_direct xgemv)
 set(SAMPLE_PROGRAMS_CPP sgemm)
 set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
@@ -171,7 +187,12 @@ foreach(ROUTINE ${LEVELX_ROUTINES})
 endforeach()
 
 # Creates and links the library
-add_library(clblast SHARED ${SOURCES})
+if(BUILD_SHARED_LIBS)
+  add_library(clblast SHARED ${SOURCES})
+else(BUILD_SHARED_LIBS)
+  add_library(clblast STATIC ${SOURCES})
+endif(BUILD_SHARED_LIBS)
+
 target_link_libraries(clblast ${OPENCL_LIBRARIES})
 
 # Includes directories: CLBlast and OpenCL
@@ -183,7 +204,9 @@ target_include_directories(clblast PUBLIC
 
 # Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built
 if(MSVC)
-  target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11
+  if(BUILD_SHARED_LIBS)
+    target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11
+  endif(BUILD_SHARED_LIBS)
 endif()
 
 # Installs the library
@@ -205,7 +228,7 @@ endif()
 
 # ==================================================================================================
 
-# Sets a default platform ($DEVICEPLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests on
+# Sets a default platform ($CLBLAST_PLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests
 set(DEVICEPLATFORM )
 if(DEFINED ENV{CLBLAST_DEVICE})
   set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{CLBLAST_DEVICE})
@@ -214,6 +237,12 @@ if(DEFINED ENV{CLBLAST_PLATFORM})
   set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{CLBLAST_PLATFORM})
 endif()
 
+# Optionally also provides other options to the tests such as -full_test ($CLBLAST_TEST_ARGUMENTS)
+set(TEST_ARGUMENTS )
+if(DEFINED ENV{CLBLAST_TEST_ARGUMENTS})
+  set(TEST_ARGUMENTS $ENV{CLBLAST_TEST_ARGUMENTS})
+endif()
+
 # ==================================================================================================
 
 # This section contains all the code related to the examples
@@ -280,8 +309,9 @@ if(CLIENTS OR TESTS)
   set(REF_INCLUDES )
   set(REF_LIBRARIES )
   if(CLBLAS_FOUND)
+    find_package(Threads)
+    set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
     set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
-    set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
     if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
       add_definitions(" /DCLBLAST_REF_CLBLAS")
     else()
@@ -317,7 +347,7 @@ if(CLIENTS)
     # Adds CLBlast's interface include paths because we can't link to CLBlast here
     target_include_directories(test_performance_common PRIVATE
                                $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
-                               ${clblast_SOURCE_DIR})
+                               ${clblast_SOURCE_DIR} ${REF_INCLUDES})
     set(CLIENTS_COMMON ${CLIENTS_COMMON} $<TARGET_OBJECTS:test_performance_common>)
   endif()
 
@@ -364,7 +394,7 @@ if(TESTS)
                 test/correctness/tester.cpp test/correctness/testblas.cpp)
     target_include_directories(test_correctness_common PUBLIC
                                $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
-                               ${clblast_SOURCE_DIR})
+                               ${clblast_SOURCE_DIR} ${REF_INCLUDES})
     set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>)
   endif()
 
@@ -389,14 +419,14 @@ if(TESTS)
     target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
     install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
     target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
-    add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM})
+    add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM} ${TEST_ARGUMENTS})
   endforeach()
 
   # Adds 'alltests' target: runs all tests
   set(ALLTESTS )
   set(ALLTESTSDEPENDS )
   foreach(ROUTINE ${ROUTINES})
-    set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM})
+    set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM} ${TEST_ARGUMENTS})
     set(ALLTESTSDEPENDS clblast_test_${ROUTINE})
   endforeach()
   add_custom_target(alltests ${ALLTESTS} DEPENDS ${ALLTESTSDEPENDS})
diff --git a/LICENSE b/LICENSE
index ae43189f..0df827ea 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,14 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
 
-Copyright (c) 2015 Cedric Nugteren
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   1. Definitions.
 
- http://www.apache.org/licenses/LICENSE-2.0
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2015 Cedric Nugteren
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
index 7f6a3d96..ec73c37c 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ The pre-requisites for compilation of CLBlast are:
   - Clang 3.3 or newer
   - AppleClang 5.0 or newer
   - ICC 14.0 or newer
-  - MSVC (Visual Studio) 2015 or newer
+  - MSVC (Visual Studio) 2013 or newer
 * An OpenCL 1.1 or newer library, for example:
   - Apple OpenCL
   - NVIDIA CUDA SDK
@@ -74,6 +74,10 @@ A custom installation folder can be specified when calling CMake:
 
     cmake -DCMAKE_INSTALL_PREFIX=/path/to/install/directory ..
 
+Building a static version of the library instead of shared one (.dylib/.so/.dll) can be done by disabling the `BUILD_SHARED_LIBS` option when calling CMake. For example:
+
+    cmake -DBUILD_SHARED_LIBS=OFF ..
+
 
 Using the library
 -------------
@@ -90,6 +94,8 @@ Afterwards, any of CLBlast's routines can be called directly: there is no need t
 
     cmake -DSAMPLES=ON ..
 
+Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
+
 
 Using the tuners (optional)
 -------------
@@ -117,8 +123,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
   - Tahiti
 * Intel GPUs:
   - HD Graphics 530
-  - HD Graphics Haswell Ultrabook GT2 Mobile
   - HD Graphics 5500 BroadWell U-Processor GT2
+  - HD Graphics Haswell Ultrabook GT2 Mobile
+  - HD Graphics IvyBridge M GT2
   - HD Graphics Skylake ULT GT2
   - Iris
   - Iris Pro
@@ -134,7 +141,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
 
     cmake -DTUNERS=ON ..
 
-Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher).
+Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.5.0 or higher).
 
 Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
 
@@ -168,7 +175,7 @@ To build these tests, another BLAS library is needed to serve as a reference. Th
 
 Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.
 
-All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
+All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake. Further options (e.g. `-full_test`) can be supplied through the `CLBLAST_TEST_ARGUMENTS` environmental variable.
 
 
 Compiling the performance tests/clients (optional)
diff --git a/doc/performance/GeForce_GTX480/SAXPY.pdf b/doc/performance/GeForce_GTX480/SAXPY.pdf
deleted file mode 100644
index 6e1c8f5a..00000000
--- a/doc/performance/GeForce_GTX480/SAXPY.pdf
+++ /dev/null
diff --git a/doc/performance/GeForce_GTX480/SGEMM.pdf b/doc/performance/GeForce_GTX480/SGEMM.pdf
deleted file mode 100644
index f430f880..00000000
--- a/doc/performance/GeForce_GTX480/SGEMM.pdf
+++ /dev/null
diff --git a/doc/performance/GeForce_GTX480/SGEMV.pdf b/doc/performance/GeForce_GTX480/SGEMV.pdf
deleted file mode 100644
index 8cb57124..00000000
--- a/doc/performance/GeForce_GTX480/SGEMV.pdf
+++ /dev/null
diff --git a/doc/performance/GeForce_GTX480/SSYMM.pdf b/doc/performance/GeForce_GTX480/SSYMM.pdf
deleted file mode 100644
index ff5941ad..00000000
--- a/doc/performance/GeForce_GTX480/SSYMM.pdf
+++ /dev/null
diff --git a/doc/performance/GeForce_GTX750Ti/SAXPY.pdf b/doc/performance/GeForce_GTX750Ti/SAXPY.pdf
new file mode 100644
index 00000000..531baa79
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SAXPY.pdf
diff --git a/doc/performance/GeForce_GTX750Ti/SGEMM.pdf b/doc/performance/GeForce_GTX750Ti/SGEMM.pdf
new file mode 100644
index 00000000..dcd62929
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SGEMM.pdf
diff --git a/doc/performance/GeForce_GTX750Ti/SGEMV.pdf b/doc/performance/GeForce_GTX750Ti/SGEMV.pdf
new file mode 100644
index 00000000..a4c3efb3
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SGEMV.pdf
diff --git a/doc/performance/GeForce_GTX750Ti/SSYMM.pdf b/doc/performance/GeForce_GTX750Ti/SSYMM.pdf
new file mode 100644
index 00000000..43d97d24
--- /dev/null
+++ b/doc/performance/GeForce_GTX750Ti/SSYMM.pdf
diff --git a/doc/performance/Intel_IrisPro/SAXPY.pdf b/doc/performance/Intel_IrisPro/SAXPY.pdf
index 3a51f306..8d639b24 100644
--- a/doc/performance/Intel_IrisPro/SAXPY.pdf
+++ b/doc/performance/Intel_IrisPro/SAXPY.pdf
diff --git a/doc/performance/Intel_IrisPro/SGEMM.pdf b/doc/performance/Intel_IrisPro/SGEMM.pdf
index 15f1714f..31725025 100644
--- a/doc/performance/Intel_IrisPro/SGEMM.pdf
+++ b/doc/performance/Intel_IrisPro/SGEMM.pdf
diff --git a/doc/performance/Intel_IrisPro/SGEMV.pdf b/doc/performance/Intel_IrisPro/SGEMV.pdf
index e1660999..9ec120c4 100644
--- a/doc/performance/Intel_IrisPro/SGEMV.pdf
+++ b/doc/performance/Intel_IrisPro/SGEMV.pdf
diff --git a/doc/performance/Radeon_M370X/SGEMM.pdf b/doc/performance/Radeon_M370X/SGEMM.pdf
index 5dca8f03..da5722f9 100644
--- a/doc/performance/Radeon_M370X/SGEMM.pdf
+++ b/doc/performance/Radeon_M370X/SGEMM.pdf
diff --git a/doc/performance/Radeon_M370X/SGEMV.pdf b/doc/performance/Radeon_M370X/SGEMV.pdf
index fa661249..513318bf 100644
--- a/doc/performance/Radeon_M370X/SGEMV.pdf
+++ b/doc/performance/Radeon_M370X/SGEMV.pdf
diff --git a/doc/performance/Radeon_M370X/SSYMM.pdf b/doc/performance/Radeon_M370X/SSYMM.pdf
index 852181d1..03efd198 100644
--- a/doc/performance/Radeon_M370X/SSYMM.pdf
+++ b/doc/performance/Radeon_M370X/SSYMM.pdf
diff --git a/include/clblast.h b/include/clblast.h
index e1d4f25b..0f52b2f9 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -27,8 +27,8 @@
 
 // Exports library functions under Windows when building a DLL. See also:
 // https://msdn.microsoft.com/en-us/library/a90k134d.aspx
-#ifdef _WIN32
-  #ifdef COMPILING_DLL
+#if defined(_WIN32) && defined(CLBLAST_DLL)
+  #if defined(COMPILING_DLL)
     #define PUBLIC_API __declspec(dllexport)
   #else
     #define PUBLIC_API __declspec(dllimport)
diff --git a/include/clblast_c.h b/include/clblast_c.h
index a13b8e64..33fb4acf 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -24,8 +24,8 @@
 
 // Exports library functions under Windows when building a DLL. See also:
 // https://msdn.microsoft.com/en-us/library/a90k134d.aspx
-#ifdef _WIN32
-  #ifdef COMPILING_DLL
+#if defined(_WIN32) && defined(CLBLAST_DLL)
+  #if defined(COMPILING_DLL)
     #define PUBLIC_API __declspec(dllexport)
   #else
     #define PUBLIC_API __declspec(dllimport)
diff --git a/include/clblast_half.h b/include/clblast_half.h
index 269a520e..05d96f9f 100644
--- a/include/clblast_half.h
+++ b/include/clblast_half.h
@@ -25,6 +25,11 @@
   #include <CL/opencl.h>
 #endif
 
+// MSVC 2013 doesn't fully support C99
+#ifdef _MSC_VER
+    #define inline __inline
+#endif
+
 // =================================================================================================
 
 // Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type,
diff --git a/samples/cache.c b/samples/cache.c
index a592824d..abc8ad4b 100644
--- a/samples/cache.c
+++ b/samples/cache.c
@@ -112,8 +112,10 @@ void run_example_routine(const cl_device_id device) {
                                    &queue, &event);
 
   // Wait for completion
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
 
   // Retrieves the execution time
   clock_t diff = clock() - start;
diff --git a/samples/dgemv.c b/samples/dgemv.c
index c22c9f37..a15d649a 100644
--- a/samples/dgemv.c
+++ b/samples/dgemv.c
@@ -84,8 +84,10 @@ int main(void) {
                                    &queue, &event);
 
   // Wait for completion
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
 
   // Example completed. See "clblast_c.h" for status codes (0 -> success).
   printf("Completed DGEMV with status %d\n", status);
diff --git a/samples/haxpy.c b/samples/haxpy.c
index d5b98e12..5bab3d42 100644
--- a/samples/haxpy.c
+++ b/samples/haxpy.c
@@ -77,8 +77,10 @@ int main(void) {
                                    &queue, &event);
 
   // Wait for completion
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
 
   // Copies the result back to the host
   clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
diff --git a/samples/sasum.c b/samples/sasum.c
index 1518cc13..02f924b0 100644
--- a/samples/sasum.c
+++ b/samples/sasum.c
@@ -73,8 +73,10 @@ int main(void) {
                                    &queue, &event);
 
   // Wait for completion
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
 
   // Copies the result back to the host
   clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
diff --git a/samples/sgemm.c b/samples/sgemm.c
index b4827777..583fc261 100644
--- a/samples/sgemm.c
+++ b/samples/sgemm.c
@@ -87,8 +87,10 @@ int main(void) {
                                    &queue, &event);
 
   // Wait for completion
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
 
   // Example completed. See "clblast_c.h" for status codes (0 -> success).
   printf("Completed SGEMM with status %d\n", status);
diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp
index a4b89968..401ecff8 100644
--- a/samples/sgemm.cpp
+++ b/samples/sgemm.cpp
@@ -95,8 +95,10 @@ int main() {
                               &queue_plain, &event);
 
   // Record the execution time
-  clWaitForEvents(1, &event);
-  clReleaseEvent(event);
+  if (status == clblast::StatusCode::kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
   auto elapsed_time = std::chrono::steady_clock::now() - start_time;
   auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
 
diff --git a/scripts/database/database.py b/scripts/database/database.py
index f758a2b7..31f313da 100755
--- a/scripts/database/database.py
+++ b/scripts/database/database.py
@@ -18,7 +18,7 @@ import database.bests as bests
 import database.defaults as defaults
 
 # Server storing a copy of the database
-DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.json"
+DATABASE_SERVER_URL = "https://raw.githubusercontent.com/CNugteren/CLBlast-database/master/database.json"
 
 # OpenCL vendor names and their short name
 VENDOR_TRANSLATION_TABLE = {
diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py
index 8190f225..d89b6350 100644
--- a/scripts/database/database/clblast.py
+++ b/scripts/database/database/clblast.py
@@ -54,19 +54,20 @@ def get_cpp_header(family):
 //
 // This file populates the database with best-found tuning parameters for the '%s' kernels.
 //\n"""
-            % family.title() + get_cpp_separator() + "\n\nnamespace clblast {\n" + get_cpp_separator())
+            % family.title() + get_cpp_separator() + \
+             "\n\nnamespace clblast {\n" + "namespace database {\n" + get_cpp_separator())
 
 
 def get_cpp_footer():
     """Retrieves the C++ footer"""
-    return "\n} // namespace clblast\n"
+    return "\n} // namespace database\n" + "} // namespace clblast\n"
 
 
 def get_cpp_precision(family, precision):
     """Retrieves the C++ code for the start of a new precision"""
     precision_string = precision_to_string(precision)
     camelcase_name = family.title().replace("_", "")
-    return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n  \"%s\", Precision::k%s, {\n"
+    return("\n\nconst Database::DatabaseEntry %s%s = {\n  \"%s\", Precision::k%s, {\n"
            % (camelcase_name, precision_string, camelcase_name, precision_string))
 
 
diff --git a/scripts/graphs/common.r b/scripts/graphs/common.r
index cd68cf26..e5dad616 100644
--- a/scripts/graphs/common.r
+++ b/scripts/graphs/common.r
@@ -31,8 +31,12 @@ options("width"=170)
 
 # ==================================================================================================
 
-# Constants
-num_runs <- 4
+# Settings
+num_runs <- 5
+num_runs_short <- 50
+xtics_subset_threshold <- 100
+xtics_subset_stepsize <- 8
+
 devices <- c("-platform","-device")
 options_string <- "-q -no_abbrv -cblas 0"
 library_names <- c("CLBlast", "clBLAS")
@@ -66,11 +70,21 @@ main <- function(routine_name, precision, test_names, test_values,
   executable <- paste("./clblast_client_", routine_name, sep="")
 
   # Configures the outputfile
-  pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)
-  par(mfrow=c(2, 3))
-  par(oma=c(0, 0, 0, 0))
-  par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
-  par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
+  file_name <- paste(display_name, ".pdf", sep="")
+  if (length(test_names) == 6) {
+    pdf(file_name, height=8, width=13)
+    par(mfrow=c(2, 3))
+    par(oma=c(0, 0, 0, 0))
+    par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
+    par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
+  }
+  else { # length(test_names) == 2
+    pdf(file_name, height=8, width=13)
+    par(mfrow=c(2, 1))
+    par(oma=c(0, 0, 0, 0))
+    par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
+    par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
+  }
 
   # Loops over the test-cases
   for (test_id in 1:length(test_names)) {
@@ -169,7 +183,12 @@ plot_graph <- function(xdata, ydata, log_setting,
        main="", xlab="", ylab="",
        ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, "n")
   axis(side=2, las=2)
-  axis(side=1, at=xdata, labels=xtics, las=2)
+  if (length(xdata) > xtics_subset_threshold) {  # Too many indices to print, plot only every Nth
+    subset <- seq(from=1, to=length(xdata), by=xtics_subset_stepsize)
+    axis(side=1, at=xdata[subset], labels=xtics[subset], las=2)
+  } else {
+    axis(side=1, at=xdata, labels=xtics, las=2)
+  }
   title(xlab=xlabel, line=-1)
   title(ylab=ylabel, line=2)
   title(graph_title, line=-2)
diff --git a/scripts/graphs/xgemm_small.r b/scripts/graphs/xgemm_small.r
new file mode 100644
index 00000000..ef94ef20
--- /dev/null
+++ b/scripts/graphs/xgemm_small.r
@@ -0,0 +1,56 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for small sizes of Xgemm, testing the direct kernel
+#
+# ==================================================================================================
+
+# Includes the common functions
+args <- commandArgs(trailingOnly = FALSE)
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xgemm"
+parameters <- c("-m","-n","-k","-layout","-transA","-transB",
+                "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+  "small matrices in steps of 16",
+  "small matrices in steps of 1"
+)
+
+# Defines the test-cases
+test_values <- list(
+  list(c( 128,  128,  128, 102, 111, 111, 57, 16, num_runs_short, precision)),
+  list(c( 128,  128,  128, 102, 111, 111, 385, 1, num_runs_short, precision))
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+  "matrix sizes (m=n=k)",
+  "matrix sizes (m=n=k)"
+)
+
+# Defines the x-axis of the test-cases
+test_xaxis <- list(
+  c("m", ""),
+  c("m", "")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+     test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
+
+# ==================================================================================================
+\ No newline at end of file
diff --git a/scripts/graphs/xsymm.r b/scripts/graphs/xsymm.r
index a65bb16f..89d137d2 100644
--- a/scripts/graphs/xsymm.r
+++ b/scripts/graphs/xsymm.r
@@ -35,32 +35,32 @@ test_names <- list(
 
 # Defines the test-cases
 test_values <- list(
-  list(c( 128,  128, 102, 111, 111, 16, 128, num_runs, precision)),
-  list(c( 129,  129, 102, 111, 111, 16, 128, num_runs, precision)),
-  list(c( 512,  512, 102, 111, 111, 16, 1, num_runs, precision)),
-  list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
+  list(c( 128,  128, 102, 141, 121, 16, 128, num_runs, precision)),
+  list(c( 129,  129, 102, 141, 121, 16, 128, num_runs, precision)),
+  list(c( 512,  512, 102, 141, 121, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 102, 141, 121, 16, 1, num_runs, precision)),
   list(
-    c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
+    c(1024, 1024, 101, 141, 121, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 141, 122, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 142, 121, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 142, 122, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 141, 122, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 142, 121, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 142, 122, 1, 0, num_runs, precision)
   ),
   list(
-    c(   8,    8, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  16,   16, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  32,   32, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  64,   64, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 128,  128, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 256,  256, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 512,  512, 102, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
-    c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
-    c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
-    c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
+    c(   8,    8, 102, 141, 121, 1, 0, num_runs, precision),
+    c(  16,   16, 102, 141, 121, 1, 0, num_runs, precision),
+    c(  32,   32, 102, 141, 121, 1, 0, num_runs, precision),
+    c(  64,   64, 102, 141, 121, 1, 0, num_runs, precision),
+    c( 128,  128, 102, 141, 121, 1, 0, num_runs, precision),
+    c( 256,  256, 102, 141, 121, 1, 0, num_runs, precision),
+    c( 512,  512, 102, 141, 121, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
+    c(2048, 2048, 102, 141, 121, 1, 0, num_runs, precision),
+    c(4096, 4096, 102, 141, 121, 1, 0, num_runs, precision),
+    c(8192, 8192, 102, 141, 121, 1, 0, num_runs, precision)
   )
 )
 
diff --git a/scripts/graphs/xsyrk.r b/scripts/graphs/xsyrk.r
index 4ab46c9f..754c93e2 100644
--- a/scripts/graphs/xsyrk.r
+++ b/scripts/graphs/xsyrk.r
@@ -35,32 +35,32 @@ test_names <- list(
 
 # Defines the test-cases
 test_values <- list(
-  list(c( 128,  128, 102, 111, 111, 16, 128, num_runs, precision)),
-  list(c( 129,  129, 102, 111, 111, 16, 128, num_runs, precision)),
-  list(c( 512,  512, 102, 111, 111, 16, 1, num_runs, precision)),
-  list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
+  list(c( 128,  128, 102, 121, 111, 16, 128, num_runs, precision)),
+  list(c( 129,  129, 102, 121, 111, 16, 128, num_runs, precision)),
+  list(c( 512,  512, 102, 121, 111, 16, 1, num_runs, precision)),
+  list(c(2048, 2048, 102, 121, 111, 16, 1, num_runs, precision)),
   list(
-    c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
+    c(1024, 1024, 101, 121, 111, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 121, 112, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 122, 111, 1, 0, num_runs, precision),
+    c(1024, 1024, 101, 122, 112, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 121, 112, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 122, 111, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 122, 112, 1, 0, num_runs, precision)
   ),
   list(
-    c(   8,    8, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  16,   16, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  32,   32, 102, 111, 111, 1, 0, num_runs, precision),
-    c(  64,   64, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 128,  128, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 256,  256, 102, 111, 111, 1, 0, num_runs, precision),
-    c( 512,  512, 102, 111, 111, 1, 0, num_runs, precision),
-    c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
-    c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
-    c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
-    c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
+    c(   8,    8, 102, 121, 111, 1, 0, num_runs, precision),
+    c(  16,   16, 102, 121, 111, 1, 0, num_runs, precision),
+    c(  32,   32, 102, 121, 111, 1, 0, num_runs, precision),
+    c(  64,   64, 102, 121, 111, 1, 0, num_runs, precision),
+    c( 128,  128, 102, 121, 111, 1, 0, num_runs, precision),
+    c( 256,  256, 102, 121, 111, 1, 0, num_runs, precision),
+    c( 512,  512, 102, 121, 111, 1, 0, num_runs, precision),
+    c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
+    c(2048, 2048, 102, 121, 111, 1, 0, num_runs, precision),
+    c(4096, 4096, 102, 121, 111, 1, 0, num_runs, precision),
+    c(8192, 8192, 102, 121, 111, 1, 0, num_runs, precision)
   )
 )
 
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index d57223dd..aaa76cb4 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -12,8 +12,8 @@
 // Portability here means that a similar header exists for CUDA with the same classes and
 // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
 //
-// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore
-// contains the following header copyright notice:
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
 //
 // =================================================================================================
 //
@@ -97,14 +97,12 @@ class Event {
   // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
   float GetElapsedTime() const {
     WaitForCompletion();
-    auto bytes = size_t{0};
-    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
-    auto time_start = size_t{0};
+    const auto bytes = sizeof(cl_ulong);
+    auto time_start = cl_ulong{0};
     clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
-    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
-    auto time_end = size_t{0};
+    auto time_end = cl_ulong{0};
     clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
-    return (time_end - time_start) * 1.0e-6f;
+    return static_cast<float>(time_end - time_start) * 1.0e-6f;
   }
 
   // Accessor to the private data-member
@@ -152,6 +150,17 @@ class Platform {
   cl_platform_id platform_;
 };
 
+// Retrieves a vector with all platforms
+inline std::vector<Platform> GetAllPlatforms() {
+  auto num_platforms = cl_uint{0};
+  CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
+  auto all_platforms = std::vector<Platform>();
+  for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
+    all_platforms.push_back(Platform(platform_id));
+  }
+  return all_platforms;
+}
+
 // =================================================================================================
 
 // C++11 version of 'cl_device_id'
@@ -201,8 +210,8 @@ class Device {
   std::vector<size_t> MaxWorkItemSizes() const {
     return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
   }
-  cl_ulong LocalMemSize() const {
-    return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
+  unsigned long LocalMemSize() const {
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
   }
   std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
   size_t CoreClock() const {
@@ -238,9 +247,11 @@ class Device {
   // Query for a specific type of device or brand
   bool IsCPU() const { return Type() == "CPU"; }
   bool IsGPU() const { return Type() == "GPU"; }
-  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
+                              Vendor() == "AuthenticAMD";; }
   bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
-  bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
+  bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
+                                Vendor() == "GenuineIntel"; }
   bool IsARM() const { return Vendor() == "ARM"; }
 
   // Accessor to the private data-member
@@ -606,8 +617,7 @@ class Buffer {
 
   // Retrieves the actual allocated size in bytes
   size_t GetSize() const {
-    auto bytes = size_t{0};
-    CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
+    const auto bytes = sizeof(size_t);
     auto result = size_t{0};
     CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
     return result;
@@ -658,17 +668,16 @@ class Kernel {
   }
 
   // Retrieves the amount of local memory used per work-group for this kernel
-  cl_ulong LocalMemUsage(const Device &device) const {
-    auto bytes = size_t{0};
+  unsigned long LocalMemUsage(const Device &device) const {
+    const auto bytes = sizeof(cl_ulong);
     auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
-    CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
     auto result = cl_ulong{0};
     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
-    return result;
+    return static_cast<unsigned long>(result);
   }
 
   // Retrieves the name of the kernel
-  std::string GetFunctionName() {
+  std::string GetFunctionName() const {
     auto bytes = size_t{0};
     CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
     auto result = std::string{};
@@ -689,6 +698,7 @@ class Kernel {
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event,
               const std::vector<Event> &waitForEvents) {
+
     // Builds a plain version of the events waiting list
     auto waitForEventsPlain = std::vector<cl_event>();
     for (auto &waitEvent : waitForEvents) {
diff --git a/src/database/database.cpp b/src/database/database.cpp
index 34c44a29..2340a89c 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -21,27 +21,42 @@
 #include "database/kernels/xgemv_fast_rot.hpp"
 #include "database/kernels/xger.hpp"
 #include "database/kernels/xgemm.hpp"
+#include "database/kernels/xgemm_direct.hpp"
 #include "database/kernels/copy.hpp"
 #include "database/kernels/pad.hpp"
 #include "database/kernels/transpose.hpp"
 #include "database/kernels/padtranspose.hpp"
+#include "database/kernel_selection.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // Initializes the database
-const std::vector<Database::DatabaseEntry> Database::database = {
-  XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
-  XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
-  XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
-  XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble,
-  XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble,
-  XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
-  XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
-  CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
-  PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
-  TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
-  PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
+const std::vector<const Database::DatabaseEntry*> Database::database = {
+  &database::XaxpyHalf, &database::XaxpySingle, &database::XaxpyDouble, &database::XaxpyComplexSingle, &database::XaxpyComplexDouble,
+  &database::XdotHalf, &database::XdotSingle, &database::XdotDouble, &database::XdotComplexSingle, &database::XdotComplexDouble,
+  &database::XgemvHalf, &database::XgemvSingle, &database::XgemvDouble, &database::XgemvComplexSingle, &database::XgemvComplexDouble,
+  &database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble,
+  &database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble,
+  &database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble,
+  &database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble,
+  &database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble,
+  &database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble,
+  &database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
+  &database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
+  &database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
+  &database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
+};
+
+// The OpenCL device vendors
+const std::string Database::kDeviceVendorAll = "default";
+
+// Alternative names for some OpenCL vendors
+const std::unordered_map<std::string, std::string> Database::kVendorNames{
+  { "Intel(R) Corporation", "Intel" },
+  { "GenuineIntel", "Intel" },
+  { "Advanced Micro Devices, Inc.", "AMD" },
+  { "NVIDIA Corporation", "NVIDIA" },
 };
 
 // =================================================================================================
@@ -49,7 +64,7 @@ const std::vector<Database::DatabaseEntry> Database::database = {
 // Constructor, computing device properties and populating the parameter-vector from the database.
 // This takes an optional overlay database in case of custom tuning or custom kernels.
 Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
-                   const Precision precision, const std::vector<DatabaseEntry> &overlay):
+                   const Precision precision, const std::vector<const DatabaseEntry*> &overlay):
   parameters_{} {
 
   // Finds information of the current device
@@ -69,8 +84,8 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
   for (auto &kernel: kernels) {
     auto search_result = ParametersPtr{};
 
-    for (auto db: { &overlay, &database }) {
-      search_result = Search(kernel, device_type, device_vendor, device_name, precision, *db);
+    for (auto &db: { database, overlay}) {
+      search_result = Search(kernel, device_type, device_vendor, device_name, precision, db);
       if (search_result) {
         parameters_.insert(search_result->begin(), search_result->end());
         break;
@@ -100,17 +115,17 @@ Database::ParametersPtr Database::Search(const std::string &this_kernel,
                                          const std::string &this_vendor,
                                          const std::string &this_device,
                                          const Precision this_precision,
-                                         const std::vector<DatabaseEntry> &this_database) const {
+                                         const std::vector<const DatabaseEntry*> &this_database) const {
 
   // Selects the right kernel
   for (auto &db: this_database) {
-    if (db.kernel == this_kernel && db.precision == this_precision) {
+    if (db->kernel == this_kernel && db->precision == this_precision) {
 
       // Searches for the right vendor and device type, or selects the default if unavailable. This
       // assumes that the default vendor / device type is last in the database.
-      for (auto &vendor: db.vendors) {
+      for (auto &vendor: db->vendors) {
         if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
-            (vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
+            (vendor.type == this_type || vendor.type == database::kDeviceTypeAll)) {
 
           // Searches for the right device. If the current device is unavailable, selects the vendor
           // default parameters. This assumes the default is last in the database.
diff --git a/src/database/database.hpp b/src/database/database.hpp
index a6ab49c5..8a3e7040 100644
--- a/src/database/database.hpp
+++ b/src/database/database.hpp
@@ -26,6 +26,19 @@
 namespace clblast {
 // =================================================================================================
 
+// A special namespace to hold all the global constant variables (including the database entries)
+namespace database {
+
+  // The OpenCL device types
+  const std::string kDeviceTypeCPU = "CPU";
+  const std::string kDeviceTypeGPU = "GPU";
+  const std::string kDeviceTypeAccelerator = "accelerator";
+  const std::string kDeviceTypeAll = "default";
+
+} // namespace database
+
+// =================================================================================================
+
 // See comment at top of file for a description of the class
 class Database {
  public:
@@ -36,54 +49,32 @@ class Database {
 
   // Structures for content inside the database
   struct DatabaseDevice {
-    const std::string name;
-    const Parameters parameters;
+    std::string name;
+    Parameters parameters;
   };
   struct DatabaseVendor {
-    const std::string type;
-    const std::string name;
-    const std::vector<DatabaseDevice> devices;
+    std::string type;
+    std::string name;
+    std::vector<DatabaseDevice> devices;
   };
   struct DatabaseEntry {
-    const std::string kernel;
-    const Precision precision;
-    const std::vector<DatabaseVendor> vendors;
+    std::string kernel;
+    Precision precision;
+    std::vector<DatabaseVendor> vendors;
   };
 
-  // The OpenCL device types
-  static constexpr auto kDeviceTypeCPU = "CPU";
-  static constexpr auto kDeviceTypeGPU = "GPU";
-  static constexpr auto kDeviceTypeAccelerator = "accelerator";
-  static constexpr auto kDeviceTypeAll = "default";
-
   // The OpenCL device vendors
-  static constexpr auto kDeviceVendorAll = "default";
+  static const std::string kDeviceVendorAll;
 
   // Alternative names for some OpenCL vendors
-  const std::unordered_map<std::string,std::string> kVendorNames {
-    {"Intel(R) Corporation", "Intel"},
-    {"GenuineIntel", "Intel"},
-    {"Advanced Micro Devices, Inc.", "AMD"},
-    {"NVIDIA Corporation", "NVIDIA"},
-  };
+  static const std::unordered_map<std::string, std::string> kVendorNames;
 
   // The database consists of separate database entries, stored together in a vector
-  static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
-  static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
-  static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
-  static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble;
-  static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble;
-  static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
-  static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
-  static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
-  static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
-  static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
-  static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
-  static const std::vector<DatabaseEntry> database;
+  static const std::vector<const DatabaseEntry*> database;
 
   // The constructor with a user-provided database overlay (potentially an empty vector)
   explicit Database(const Queue &queue, const std::vector<std::string> &routines,
-                    const Precision precision, const std::vector<DatabaseEntry> &overlay);
+                    const Precision precision, const std::vector<const DatabaseEntry*> &overlay);
 
   // Accessor of values by key
   size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
@@ -95,7 +86,8 @@ class Database {
   // Search method for a specified database, returning pointer (possibly a nullptr)
   ParametersPtr Search(const std::string &this_kernel, const std::string &this_type,
                        const std::string &this_vendor, const std::string &this_device,
-                       const Precision this_precision, const std::vector<DatabaseEntry> &db) const;
+                       const Precision this_precision,
+                       const std::vector<const DatabaseEntry*> &db) const;
 
   // Found parameters suitable for this device/kernel
   Parameters parameters_;
diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp
new file mode 100644
index 00000000..7e5e7821
--- /dev/null
+++ b/src/database/kernel_selection.hpp
@@ -0,0 +1,131 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel
+// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the
+// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use
+// some common default values.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionHalf = {
+  "KernelSelection", Precision::kHalf, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionSingle = {
+  "KernelSelection", Precision::kSingle, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionComplexSingle = {
+  "KernelSelection", Precision::kComplexSingle, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionDouble = {
+  "KernelSelection", Precision::kDouble, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionComplexDouble = {
+  "KernelSelection", Precision::kComplexDouble, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
index a6b7dfe8..16aa6b3f 100644
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::CopyHalf = {
+const Database::DatabaseEntry CopyHalf = {
   "Copy", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::CopyHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::CopySingle = {
+const Database::DatabaseEntry CopySingle = {
   "Copy", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::CopySingle = {
         { "Intel(R) HD Graphics 530",                        { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
@@ -84,7 +86,7 @@ const Database::DatabaseEntry Database::CopySingle = {
         { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::CopySingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::CopyComplexSingle = {
+const Database::DatabaseEntry CopyComplexSingle = {
   "Copy", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -128,6 +130,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
         { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
@@ -147,7 +150,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
         { "GeForce GTX 480",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
@@ -165,7 +168,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::CopyDouble = {
+const Database::DatabaseEntry CopyDouble = {
   "Copy", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -205,13 +208,13 @@ const Database::DatabaseEntry Database::CopyDouble = {
         { "GeForce GTX 670",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
       }
     },
     { // Default
@@ -224,7 +227,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::CopyComplexDouble = {
+const Database::DatabaseEntry CopyComplexDouble = {
   "Copy", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -264,7 +267,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
         { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@@ -282,4 +285,5 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
index 3cfabaf4..6c5e0c2f 100644
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadHalf = {
+const Database::DatabaseEntry PadHalf = {
   "Pad", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::PadHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadSingle = {
+const Database::DatabaseEntry PadSingle = {
   "Pad", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::PadSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Iris Pro",                                        { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
@@ -84,7 +86,7 @@ const Database::DatabaseEntry Database::PadSingle = {
         { "GeForce GTX 670",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
-        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::PadSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadComplexSingle = {
+const Database::DatabaseEntry PadComplexSingle = {
   "Pad", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -134,10 +136,11 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Iris Pro",                                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
       }
     },
     { // Intel accelerators
@@ -154,13 +157,13 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
         { "GeForce GTX 670",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX 750",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Tesla K40m",                                      { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
     { // Default
@@ -173,7 +176,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadDouble = {
+const Database::DatabaseEntry PadDouble = {
   "Pad", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -232,7 +235,7 @@ const Database::DatabaseEntry Database::PadDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadComplexDouble = {
+const Database::DatabaseEntry PadComplexDouble = {
   "Pad", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -272,7 +275,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
         { "GeForce GTX 670",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -290,4 +293,5 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
index 88bd4ea7..4003ec6d 100644
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadtransposeHalf = {
+const Database::DatabaseEntry PadtransposeHalf = {
   "Padtranspose", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::PadtransposeHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadtransposeSingle = {
+const Database::DatabaseEntry PadtransposeSingle = {
   "Padtranspose", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris",                                            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
+const Database::DatabaseEntry PadtransposeComplexSingle = {
   "Padtranspose", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -134,6 +136,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Iris",                                            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -173,7 +176,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadtransposeDouble = {
+const Database::DatabaseEntry PadtransposeDouble = {
   "Padtranspose", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -232,7 +235,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
+const Database::DatabaseEntry PadtransposeComplexDouble = {
   "Padtranspose", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -272,7 +275,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
         { "GeForce GTX 670",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 750",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
@@ -290,4 +293,5 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
index 0e1b608e..c5ea50c2 100644
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::TransposeHalf = {
+const Database::DatabaseEntry TransposeHalf = {
   "Transpose", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::TransposeHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::TransposeSingle = {
+const Database::DatabaseEntry TransposeSingle = {
   "Transpose", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
         { "Intel(R) HD Graphics 530",                        { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::TransposeComplexSingle = {
+const Database::DatabaseEntry TransposeComplexSingle = {
   "Transpose", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -134,6 +136,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
@@ -159,7 +162,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
       }
     },
   }
@@ -167,7 +170,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::TransposeDouble = {
+const Database::DatabaseEntry TransposeDouble = {
   "Transpose", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -207,7 +210,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
         { "GeForce GTX 670",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 750",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX TITAN",                               { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@@ -226,7 +229,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::TransposeComplexDouble = {
+const Database::DatabaseEntry TransposeComplexDouble = {
   "Transpose", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -278,4 +281,5 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
index 9c1bcd99..60471bef 100644
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XaxpyHalf = {
+const Database::DatabaseEntry XaxpyHalf = {
   "Xaxpy", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XaxpyHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XaxpySingle = {
+const Database::DatabaseEntry XaxpySingle = {
   "Xaxpy", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",1}, {"WGS",512}, {"WPT",2} } },
         { "Iris",                                            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Iris Pro",                                        { {"VW",1}, {"WGS",128}, {"WPT",2} } },
@@ -84,7 +86,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
         { "GeForce GTX 670",                                 { {"VW",2}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW",2}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",4}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XaxpyComplexSingle = {
+const Database::DatabaseEntry XaxpyComplexSingle = {
   "Xaxpy", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -134,6 +136,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW",4}, {"WGS",64}, {"WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",2}, {"WGS",512}, {"WPT",1} } },
         { "Iris",                                            { {"VW",2}, {"WGS",128}, {"WPT",1} } },
         { "Iris Pro",                                        { {"VW",1}, {"WGS",256}, {"WPT",8} } },
@@ -173,7 +176,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XaxpyDouble = {
+const Database::DatabaseEntry XaxpyDouble = {
   "Xaxpy", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -213,7 +216,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
         { "GeForce GTX 670",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",256}, {"WPT",2} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",512}, {"WPT",1} } },
@@ -232,7 +235,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XaxpyComplexDouble = {
+const Database::DatabaseEntry XaxpyComplexDouble = {
   "Xaxpy", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -272,7 +275,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
         { "GeForce GTX 670",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",256}, {"WPT",2} } },
+        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",64}, {"WPT",2} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS",64}, {"WPT",4} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
@@ -290,4 +293,5 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
index 987a990d..686b2839 100644
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XdotHalf = {
+const Database::DatabaseEntry XdotHalf = {
   "Xdot", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XdotHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XdotSingle = {
+const Database::DatabaseEntry XdotSingle = {
   "Xdot", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -55,6 +56,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",64}, {"WGS2",32} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",512}, {"WGS2",128} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WGS2",32} } },
         { "Iris Pro",                                        { {"WGS1",512}, {"WGS2",64} } },
         { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
@@ -68,6 +70,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",512}, {"WGS2",1024} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",128} } },
         { "GeForce GTX 750",                                 { {"WGS1",128}, {"WGS2",32} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",1024}, {"WGS2",32} } },
@@ -84,7 +87,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XdotComplexSingle = {
+const Database::DatabaseEntry XdotComplexSingle = {
   "Xdot", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -106,6 +109,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",512}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",32}, {"WGS2",32} } },
         { "Iris Pro",                                        { {"WGS1",32}, {"WGS2",32} } },
         { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
@@ -119,6 +123,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",32} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",64} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
@@ -135,7 +140,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XdotDouble = {
+const Database::DatabaseEntry XdotDouble = {
   "Xdot", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -160,6 +165,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",256} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 980",                                 { {"WGS1",128}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
@@ -176,7 +182,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XdotComplexDouble = {
+const Database::DatabaseEntry XdotComplexDouble = {
   "Xdot", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -201,6 +207,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",512}, {"WGS2",128} } },
         { "GeForce GTX 680",                                 { {"WGS1",256}, {"WGS2",64} } },
         { "GeForce GTX 750",                                 { {"WGS1",256}, {"WGS2",32} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",128}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",128}, {"WGS2",32} } },
@@ -216,4 +223,5 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
index d19c55b5..8303fa83 100644
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemmHalf = {
+const Database::DatabaseEntry XgemmHalf = {
   "Xgemm", Precision::kHalf, {
     { // Default
       kDeviceTypeAll, "default", {
@@ -26,7 +27,7 @@ const Database::DatabaseEntry Database::XgemmHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemmSingle = {
+const Database::DatabaseEntry XgemmSingle = {
   "Xgemm", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -57,9 +58,10 @@ const Database::DatabaseEntry Database::XgemmSingle = {
         { "Intel(R) HD Graphics 530",                        { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
         { "Iris",                                            { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
-        { "Iris Pro",                                        { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
+        { "Iris Pro",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
@@ -77,7 +79,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
         { "GeForce GTX 670",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
         { "GeForce GTX 750",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } },
         { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
@@ -96,7 +98,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemmComplexSingle = {
+const Database::DatabaseEntry XgemmComplexSingle = {
   "Xgemm", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -127,6 +129,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
         { "Iris",                                            { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Iris Pro",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
@@ -147,7 +150,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
         { "GeForce GTX 670",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX 750",                                 { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
         { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
@@ -166,7 +169,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemmDouble = {
+const Database::DatabaseEntry XgemmDouble = {
   "Xgemm", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -206,7 +209,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
         { "GeForce GTX 670",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX 750",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
         { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -225,7 +228,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemmComplexDouble = {
+const Database::DatabaseEntry XgemmComplexDouble = {
   "Xgemm", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -265,7 +268,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
         { "GeForce GTX 670",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
         { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 750",                                 { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Tesla K20m",                                      { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -282,4 +285,5 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp
new file mode 100644
index 00000000..89499cc6
--- /dev/null
+++ b/src/database/kernels/xgemm_direct.hpp
@@ -0,0 +1,138 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectHalf = {
+  "XgemmDirect", Precision::kHalf, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectSingle = {
+  "XgemmDirect", Precision::kSingle, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Iris Pro",                                        { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectComplexSingle = {
+  "XgemmDirect", Precision::kComplexSingle, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Iris Pro",                                        { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectDouble = {
+  "XgemmDirect", Precision::kDouble, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectComplexDouble = {
+  "XgemmDirect", Precision::kComplexDouble, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index e5e8845e..90355b96 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvHalf = {
+const Database::DatabaseEntry XgemvHalf = {
   "Xgemv", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgemvHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvSingle = {
+const Database::DatabaseEntry XgemvSingle = {
   "Xgemv", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -57,6 +58,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",256}, {"WPT1",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",256}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1} } },
         { "Iris",                                            { {"WGS1",64}, {"WPT1",2} } },
         { "Iris Pro",                                        { {"WGS1",256}, {"WPT1",2} } },
@@ -77,7 +79,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",256}, {"WPT1",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",256}, {"WPT1",1} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
         { "GeForce GTX 980",                                 { {"WGS1",128}, {"WPT1",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WPT1",1} } },
@@ -96,7 +98,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvComplexSingle = {
+const Database::DatabaseEntry XgemvComplexSingle = {
   "Xgemv", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -120,6 +122,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",64}, {"WPT1",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",256}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1} } },
         { "Iris",                                            { {"WGS1",256}, {"WPT1",1} } },
         { "Iris Pro",                                        { {"WGS1",64}, {"WPT1",1} } },
@@ -140,7 +143,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",128}, {"WPT1",1} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
         { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
@@ -155,7 +158,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvDouble = {
+const Database::DatabaseEntry XgemvDouble = {
   "Xgemv", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -188,7 +191,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",128}, {"WPT1",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WPT1",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WPT1",1} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
         { "GeForce GTX 980",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",64}, {"WPT1",1} } },
@@ -207,7 +210,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvComplexDouble = {
+const Database::DatabaseEntry XgemvComplexDouble = {
   "Xgemv", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -249,4 +252,5 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp
index 52af628c..8e6254ac 100644
--- a/src/database/kernels/xgemv_fast.hpp
+++ b/src/database/kernels/xgemv_fast.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastHalf = {
+const Database::DatabaseEntry XgemvFastHalf = {
   "XgemvFast", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgemvFastHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastSingle = {
+const Database::DatabaseEntry XgemvFastSingle = {
   "XgemvFast", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -57,10 +58,11 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "Iris",                                            { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
         { "Iris Pro",                                        { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
-        { "default",                                         { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
       }
     },
     { // Intel accelerators
@@ -77,7 +79,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
         { "GeForce GTX 670",                                 { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
         { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
         { "GeForce GTX 750",                                 { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
         { "GeForce GTX 980",                                 { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "GeForce GTX TITAN",                               { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "GeForce GTX TITAN X",                             { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@@ -96,7 +98,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
+const Database::DatabaseEntry XgemvFastComplexSingle = {
   "XgemvFast", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -120,6 +122,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "Iris",                                            { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "Iris Pro",                                        { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
@@ -139,7 +142,6 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
         { "GeForce GTX 480",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "GeForce GTX 670",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
       }
     },
@@ -153,7 +155,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastDouble = {
+const Database::DatabaseEntry XgemvFastDouble = {
   "XgemvFast", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -186,7 +188,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
         { "GeForce GTX 670",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
         { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
         { "GeForce GTX 750",                                 { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
-        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
         { "GeForce GTX 980",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "GeForce GTX TITAN",                               { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "GeForce GTX TITAN X",                             { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
@@ -205,7 +207,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
+const Database::DatabaseEntry XgemvFastComplexDouble = {
   "XgemvFast", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -247,4 +249,5 @@ const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp
index 328094e1..8fe45e01 100644
--- a/src/database/kernels/xgemv_fast_rot.hpp
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastRotHalf = {
+const Database::DatabaseEntry XgemvFastRotHalf = {
   "XgemvFastRot", Precision::kHalf, {
     { // Default
       kDeviceTypeAll, "default", {
@@ -26,7 +27,7 @@ const Database::DatabaseEntry Database::XgemvFastRotHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastRotSingle = {
+const Database::DatabaseEntry XgemvFastRotSingle = {
   "XgemvFastRot", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -44,6 +45,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
       kDeviceTypeGPU, "Intel", {
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
         { "Iris Pro",                                        { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
         { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
@@ -51,8 +53,9 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
         { "GeForce GTX TITAN",                               { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
-        { "default",                                         { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
       }
     },
     { // Default
@@ -65,7 +68,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
+const Database::DatabaseEntry XgemvFastRotComplexSingle = {
   "XgemvFastRot", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -83,6 +86,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
       kDeviceTypeGPU, "Intel", {
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
         { "Iris Pro",                                        { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
         { "default",                                         { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } },
@@ -98,7 +102,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastRotDouble = {
+const Database::DatabaseEntry XgemvFastRotDouble = {
   "XgemvFastRot", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -114,8 +118,9 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
         { "GeForce GTX TITAN",                               { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
-        { "default",                                         { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
       }
     },
     { // Default
@@ -128,7 +133,7 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
+const Database::DatabaseEntry XgemvFastRotComplexDouble = {
   "XgemvFastRot", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -151,4 +156,5 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
index 3e9c25c1..f2fc2a9a 100644
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgerHalf = {
+const Database::DatabaseEntry XgerHalf = {
   "Xger", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgerHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgerSingle = {
+const Database::DatabaseEntry XgerSingle = {
   "Xger", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -63,6 +64,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
         { "Iris Pro",                                        { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
         { "default",                                         { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
@@ -76,8 +78,9 @@ const Database::DatabaseEntry Database::XgerSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
       }
     },
     { // Default
@@ -90,7 +93,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgerComplexSingle = {
+const Database::DatabaseEntry XgerComplexSingle = {
   "Xger", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -120,6 +123,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
         { "Iris Pro",                                        { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
         { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
@@ -133,6 +137,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
         { "GeForce GTX 750",                                 { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
         { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
       }
@@ -147,7 +152,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgerDouble = {
+const Database::DatabaseEntry XgerDouble = {
   "Xger", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -180,8 +185,9 @@ const Database::DatabaseEntry Database::XgerDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
         { "GeForce GTX 750",                                 { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
       }
     },
     { // Default
@@ -194,7 +200,7 @@ const Database::DatabaseEntry Database::XgerDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgerComplexDouble = {
+const Database::DatabaseEntry XgerComplexDouble = {
   "Xger", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -227,6 +233,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
         { "default",                                         { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
       }
@@ -240,4 +247,5 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 223501fd..b0817242 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -204,7 +204,7 @@ R"(
 #if PRECISION == 3232 || PRECISION == 6464
   #define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y
 #else
-  #define COMPLEX_CONJUGATE(value) value = value
+  #define COMPLEX_CONJUGATE(value) 
 #endif
 
 // =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl
new file mode 100644
index 00000000..a8bd450e
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part1.opencl
@@ -0,0 +1,273 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any
+// pre and and post-processing kernels.
+//
+// This kernel is seperated into three files. This is part 1 out of 3.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library. Note that all parameters here have a
+// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel.
+#ifndef WGD
+  #define WGD 8      // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64)
+#endif
+#ifndef MDIMCD
+  #define MDIMCD 8    // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef NDIMCD
+  #define NDIMCD 8    // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef MDIMAD
+  #define MDIMAD 8    // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
+#endif
+#ifndef NDIMBD
+  #define NDIMBD 8    // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
+#endif
+#ifndef KWID
+  #define KWID 1      // Unroll factor of the WGD loop (smaller or equal than WGD)
+#endif
+#ifndef VWMD
+  #define VWMD 1      // Vector width of matrices A and C
+#endif
+#ifndef VWND
+  #define VWND 1      // Vector width of matrix B
+#endif
+#ifndef PADA
+  #define PADA 1      // Local memory padding for matrix A
+#endif
+#ifndef PADB
+  #define PADB 1      // Local memory padding for matrix B
+#endif
+
+// Helper parameters based on the above tuning parameters
+#define MWID (WGD/MDIMCD)                // Work per work-item (M-dimension)
+#define NWID (WGD/NDIMCD)                // Work per work-item (N-dimension)
+#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
+#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
+#define MWAD (WGD/MDIMAD)                // Amount of loads-per-thread for matrix A (M-dimension)
+#define KWAD (WGD/KDIMAD)                // Amount of loads-per-thread for matrix A (K-dimension)
+#define KWBD (WGD/KDIMBD)                // Amount of loads-per-thread for matrix B (K-dimension)
+#define NWBD (WGD/NDIMBD)                // Amount of loads-per-thread for matrix B (N-dimension)
+
+// =================================================================================================
+
+// Data-widths in dimension M
+#if VWMD == 1
+    typedef real realMD;
+#elif VWMD == 2
+    typedef real2 realMD;
+#elif VWMD == 4
+    typedef real4 realMD;
+#elif VWMD == 8
+    typedef real8 realMD;
+#elif VWMD == 16
+    typedef real16 realMD;
+#endif
+
+// Data-widths in dimension N
+#if VWND == 1
+    typedef real realND;
+#elif VWND == 2
+    typedef real2 realND;
+#elif VWND == 4
+    typedef real4 realND;
+#elif VWND == 8
+    typedef real8 realND;
+#elif VWND == 16
+    typedef real16 realND;
+#endif
+
+// =================================================================================================
+
+// Initializes the accumulation registers to zero
+inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    #pragma unroll
+    for (int ni=0; ni<NWID; ++ni) {
+      SetToZero(cpm[ni][mi]);
+    }
+  }
+}
+
+// =================================================================================================
+
+// Performs the actual computation: Cpm += Apm * Bpm
+inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWID; ++mi) {
+      MultiplyAdd(cpm[ni][mi], apm[mi], bpm[ni]);
+    }
+  }
+}
+
+// =================================================================================================
+
+// Loads global off-chip memory into thread-private register files. This function is specific for
+// loading the A input matrix.
+inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
+                                   const int a_ld, const int a_offset, const int idm, const int idk,
+                                   const int a_transpose, const int a_conjugate) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
+    apm[mi] = agms[a_index + a_offset];
+    if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
+                                   const int b_ld, const int b_offset, const int idn, const int idk,
+                                   const int b_transpose, const int b_conjugate) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
+    bpm[ni] = bgms[b_index + b_offset];
+    if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
+  }
+}
+
+// Loads global off-chip memory into thread-private register files. This function is specific for
+// loading the A input matrix. This is the same as above but now includes a bounds check.
+inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
+                                    const int a_ld, const int a_offset, const int idm, const int idk,
+                                    const int a_transpose, const int a_conjugate,
+                                    const int kSizeM) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    if (idm + mi < kSizeM) {
+      const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
+      apm[mi] = agms[a_index + a_offset];
+      if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
+    }
+    else {
+      SetToZero(apm[mi]);
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
+                                    const int b_ld, const int b_offset, const int idn, const int idk,
+                                    const int b_transpose, const int b_conjugate,
+                                    const int kSizeN) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    if (idn + ni < kSizeN) {
+      const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
+      bpm[ni] = bgms[b_index + b_offset];
+      if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
+    }
+    else {
+      SetToZero(bpm[ni]);
+    }
+  }
+}
+
+// =================================================================================================
+
+// Caches on-chip local memory into per-thread private memory (registers). This function is specific
+// for caching the A input matrix.
+inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
+                                  const int a_transpose) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    const int mg = mi + get_local_id(0)*MWID;
+    const int index = (a_transpose) ? mg*(WGD + PADA) + kg : kg*(WGD + PADA) + mg;
+    apm[mi] = alm[index];
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
+                                  const int b_transpose) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    const int ng = ni + get_local_id(1)*NWID;
+    const int index = (b_transpose) ? ng*(WGD + PADB) + kg : kg*(WGD + PADB) + ng;
+    bpm[ni] = blm[index];
+  }
+}
+
+// =================================================================================================
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
+                               const int idm, const int idn,
+                               const real alpha, const real beta,
+                               const int c_ld, const int c_offset, const int c_transpose) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWID; ++mi) {
+
+      // Determines the destination index
+      int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi);
+
+      // The final multiplication with alpha (in case beta == 0)
+      real result;
+      if (IsZero(beta)) {
+        Multiply(result, alpha, cpm[ni][mi]);
+      }
+      // The final multiplication with alpha and the addition with beta*C
+      else {
+        AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]);
+      }
+      cgm[c_index + c_offset] = result;
+    }
+  }
+}
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
+                                const int idm, const int idn, const int kSizeM, const int kSizeN,
+                                const real alpha, const real beta,
+                                const int c_ld, const int c_offset, const int c_transpose) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWID; ++mi) {
+      if ((idm + mi) < kSizeM && (idn + ni) < kSizeN) {
+
+        // Determines the destination index
+        int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi);
+
+        // The final multiplication with alpha (in case beta == 0)
+        real result;
+        if (IsZero(beta)) {
+          Multiply(result, alpha, cpm[ni][mi]);
+        }
+        // The final multiplication with alpha and the addition with beta*C
+        else {
+          AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]);
+        }
+        cgm[c_index + c_offset] = result;
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
new file mode 100644
index 00000000..d77cbf65
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -0,0 +1,314 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix.
+inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
+                                 const int a_ld, const int a_offset, const int kwg,
+                                 const int a_transpose, const int a_conjugate) {
+  #if MDIMCD == MDIMAD
+    const int la0 = get_local_id(0);
+    const int la1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int la0 = tid % MDIMAD;
+    const int la1 = tid / MDIMAD;
+  #endif
+  #pragma unroll
+  for (int mia=0; mia<MWAD/VWMD; ++mia) {
+    #pragma unroll
+    for (int kia=0; kia<KWAD; ++kia) {
+
+      // Computes the indices for the global memory
+      int mg = mia + la0*(MWAD/VWMD);
+      int kg = kia + la1*KWAD;
+      int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD);
+      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
+      #if VWMD == 1
+         alm[kg*(WGD + PADA) + mg] = avec;
+      #elif VWMD == 2
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
+      #elif VWMD == 4
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
+         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.z;
+         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.w;
+      #elif VWMD == 8
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
+         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
+         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
+         alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
+         alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
+         alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
+         alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
+      #elif VWMD == 16
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
+         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
+         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
+         alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
+         alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
+         alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
+         alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
+         alm[kg*(WGD + PADA) + mg*VWMD + 8] = avec.s8;
+         alm[kg*(WGD + PADA) + mg*VWMD + 9] = avec.s9;
+         alm[kg*(WGD + PADA) + mg*VWMD + 10] = avec.sA;
+         alm[kg*(WGD + PADA) + mg*VWMD + 11] = avec.sB;
+         alm[kg*(WGD + PADA) + mg*VWMD + 12] = avec.sC;
+         alm[kg*(WGD + PADA) + mg*VWMD + 13] = avec.sD;
+         alm[kg*(WGD + PADA) + mg*VWMD + 14] = avec.sE;
+         alm[kg*(WGD + PADA) + mg*VWMD + 15] = avec.sF;
+      #endif
+      if (a_conjugate) {
+        for (int vm=0; vm<VWMD; ++vm) {
+          COMPLEX_CONJUGATE(alm[kg*(WGD + PADA) + mg*VWMD + vm]);
+        }
+      }
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
+                                 const int b_ld, const int b_offset, const int kwg,
+                                 const int b_transpose, const int b_conjugate) {
+  #if MDIMCD == NDIMBD
+    const int lb0 = get_local_id(0);
+    const int lb1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int lb0 = tid % NDIMBD;
+    const int lb1 = tid / NDIMBD;
+  #endif
+  #pragma unroll
+  for (int kib=0; kib<KWBD; ++kib) {
+    #pragma unroll
+    for (int nib=0; nib<NWBD/VWND; ++nib) {
+
+      // Computes the indices for the global memory
+      int ng = nib + lb0*(NWBD/VWND);
+      int kg = kib + lb1*KWBD;
+      int idn = (b_transpose) ? ng + kwg/VWND : ng + GetGroupID1()*(WGD/VWND);
+      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
+      #if VWND == 1
+         blm[kg*(WGD + PADB) + ng] = bvec;
+      #elif VWND == 2
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
+      #elif VWND == 4
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
+         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.z;
+         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.w;
+      #elif VWND == 8
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
+         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
+         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
+         blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
+         blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
+         blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
+         blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
+      #elif VWND == 16
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
+         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
+         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
+         blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
+         blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
+         blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
+         blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
+         blm[kg*(WGD + PADB) + ng*VWND + 8] = bvec.s8;
+         blm[kg*(WGD + PADB) + ng*VWND + 9] = bvec.s9;
+         blm[kg*(WGD + PADB) + ng*VWND + 10] = bvec.sA;
+         blm[kg*(WGD + PADB) + ng*VWND + 11] = bvec.sB;
+         blm[kg*(WGD + PADB) + ng*VWND + 12] = bvec.sC;
+         blm[kg*(WGD + PADB) + ng*VWND + 13] = bvec.sD;
+         blm[kg*(WGD + PADB) + ng*VWND + 14] = bvec.sE;
+         blm[kg*(WGD + PADB) + ng*VWND + 15] = bvec.sF;
+      #endif
+      if (b_conjugate) {
+        for (int vn=0; vn<VWND; ++vn) {
+          COMPLEX_CONJUGATE(blm[kg*(WGD + PADB) + ng*VWND + vn]);
+        }
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. In contrast to the functions above, this function performs doesn't
+// use the vector data-types.
+inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
+                                 const int a_ld, const int a_offset, const int kwg,
+                                 const int a_transpose, const int a_conjugate) {
+  #if MDIMCD == MDIMAD
+    const int la0 = get_local_id(0);
+    const int la1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int la0 = tid % MDIMAD;
+    const int la1 = tid / MDIMAD;
+  #endif
+  #pragma unroll
+  for (int mia=0; mia<MWAD; ++mia) {
+    #pragma unroll
+    for (int kia=0; kia<KWAD; ++kia) {
+
+      // Computes the indices for the global memory
+      int mg = mia + la0*MWAD;
+      int kg = kia + la1*KWAD;
+      int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
+      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      real result = agms[idk*a_ld + idm + a_offset];
+      if (a_conjugate) { COMPLEX_CONJUGATE(result); }
+      alm[kg*(WGD + PADA) + mg] = result;
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
+                                 const int b_ld, const int b_offset, const int kwg,
+                                 const int b_transpose, const int b_conjugate) {
+  #if MDIMCD == NDIMBD
+    const int lb0 = get_local_id(0);
+    const int lb1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int lb0 = tid % NDIMBD;
+    const int lb1 = tid / NDIMBD;
+  #endif
+  #pragma unroll
+  for (int kib=0; kib<KWBD; ++kib) {
+    #pragma unroll
+    for (int nib=0; nib<NWBD; ++nib) {
+
+      // Computes the indices for the global memory
+      int ng = nib + lb0*NWBD;
+      int kg = kib + lb1*KWBD;
+      int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
+      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      real result = bgms[idk*b_ld + idn + b_offset];
+      if (b_conjugate) { COMPLEX_CONJUGATE(result); }
+      blm[kg*(WGD + PADB) + ng] = result;
+    }
+  }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. In contrast to the functions above, this function performs bounds
+// checks and doesn't use the vector data-types.
+inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
+                                  const int a_ld, const int a_offset, const int kwg,
+                                  const int a_transpose, const int a_conjugate,
+                                  const int kSizeM, const int kSizeK) {
+  #if MDIMCD == MDIMAD
+    const int la0 = get_local_id(0);
+    const int la1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int la0 = tid % MDIMAD;
+    const int la1 = tid / MDIMAD;
+  #endif
+  #pragma unroll
+  for (int mia=0; mia<MWAD; ++mia) {
+    #pragma unroll
+    for (int kia=0; kia<KWAD; ++kia) {
+
+      // Computes the indices for the global memory
+      int mg = mia + la0*MWAD;
+      int kg = kia + la1*KWAD;
+      int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
+      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      int condition = (a_transpose) ? idm < kSizeK : idm < kSizeM;
+      if (condition) {
+        real result = agms[idk*a_ld + idm + a_offset];
+        if (a_conjugate) { COMPLEX_CONJUGATE(result); }
+        alm[kg*(WGD + PADA) + mg] = result;
+      }
+      else {
+        SetToZero(alm[kg*(WGD + PADA) + mg]);
+      }
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
+                                  const int b_ld, const int b_offset, const int kwg,
+                                  const int b_transpose, const int b_conjugate,
+                                  const int kSizeN, const int kSizeK) {
+  #if MDIMCD == NDIMBD
+    const int lb0 = get_local_id(0);
+    const int lb1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int lb0 = tid % NDIMBD;
+    const int lb1 = tid / NDIMBD;
+  #endif
+  #pragma unroll
+  for (int kib=0; kib<KWBD; ++kib) {
+    #pragma unroll
+    for (int nib=0; nib<NWBD; ++nib) {
+
+      // Computes the indices for the global memory
+      int ng = nib + lb0*NWBD;
+      int kg = kib + lb1*KWBD;
+      int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
+      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      int condition = (b_transpose) ? idn < kSizeK : idn < kSizeN;
+      if (condition) {
+        real result = bgms[idk*b_ld + idn + b_offset];
+        if (b_conjugate) { COMPLEX_CONJUGATE(result); }
+        blm[kg*(WGD + PADB) + ng] = result;
+      }
+      else {
+        SetToZero(blm[kg*(WGD + PADB) + ng]);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
new file mode 100644
index 00000000..a9350e00
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -0,0 +1,214 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Main body of the kernel. This is the direct version without pre/post processing and restrictions.
+inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
+                        const real_arg arg_alpha,
+                        const real_arg arg_beta,
+                        const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                        const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                        __global real* cgm, const int c_offset, const int c_ld,
+                        __local real* alm, __local real* blm,
+                        const int a_transpose, const int b_transpose, const int c_transpose,
+                        const int a_conjugate, const int b_conjugate) {
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
+
+  // Extra pointers to scalar versions of global memory
+  const __global real* restrict agms = (const __global real* restrict) agm;
+  const __global real* restrict bgms = (const __global real* restrict) bgm;
+
+  // Allocates workitem-private memory (registers)
+  real apm[MWID];
+  real bpm[NWID];
+  real cpm[NWID][MWID];
+
+  // Initializes the accumulation registers
+  InitAccRegistersDirect(cpm);
+
+  // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section
+  // processes only the main parts: output blocks of WGD by WGD.
+  const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD;
+  const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD;
+  if ((idm < (kSizeM/WGD)*WGD) && (idn < (kSizeN/WGD)*WGD)) {
+
+    // Loops over all complete workgroup tiles (K-dimension)
+    int kwg = 0;
+    for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
+
+      // Loads data: off-chip --> local (matrix A and B)
+      if (a_ld % VWMD == 0) {
+        GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
+      }
+      else {
+        GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
+      }
+      if (b_ld % VWND == 0) {
+        GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
+      }
+      else {
+        GlobalToLocalScalarB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      // Loops over all workitem tiles, unrolled by a factor KWID
+      for (int pwi=0; pwi<WGD; pwi+=KWID) {
+        #pragma unroll
+        for (int pit=0; pit<KWID; ++pit) {
+          int kg = pwi + pit;
+
+          // Loads data: local --> private (matrix A and B)
+          LocalToPrivateDirectA(alm, apm, kg, a_transpose);
+          LocalToPrivateDirectB(blm, bpm, kg, b_transpose);
+
+          // Performs the accumulation (Cpm += Apm * Bpm)
+          MultiplyAccumulateDirect(cpm, apm, bpm);
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Loop over the remaining part (incomplete tile in K-dimension)
+    for (; kwg < kSizeK; ++kwg) {
+
+      // Loads data: off-chip --> private (matrix A and B)
+      GlobalToPrivateDirectA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate);
+      GlobalToPrivateDirectB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate);
+
+      // Performs the accumulation (Cpm += Apm * Bpm)
+      MultiplyAccumulateDirect(cpm, apm, bpm);
+    }
+
+    // Stores a tile of results and performs the multiplication with alpha and beta
+    StoreResultsDirect(cgm, cpm, idm, idn, alpha, beta, c_ld, c_offset, c_transpose);
+  }
+
+  // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions)
+  else {
+
+    // Loops over all complete workgroup tiles (K-dimension)
+    int kwg = 0;
+    for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
+
+      // Loads data: off-chip --> local (matrix A and B)
+      GlobalToLocalCheckedA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate, kSizeM, kSizeK);
+      GlobalToLocalCheckedB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate, kSizeN, kSizeK);
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      // Loops over all workitem tiles, unrolled by a factor KWID
+      for (int pwi=0; pwi<WGD; pwi+=KWID) {
+        #pragma unroll
+        for (int pit=0; pit<KWID; ++pit) {
+          int kg = pwi + pit;
+
+          // Loads data: local --> private (matrix A and B)
+          LocalToPrivateDirectA(alm, apm, kg, a_transpose);
+          LocalToPrivateDirectB(blm, bpm, kg, b_transpose);
+
+          // Performs the accumulation (Cpm += Apm * Bpm)
+          MultiplyAccumulateDirect(cpm, apm, bpm);
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Loop over the remaining part (incomplete tile in K-dimension)
+    for (; kwg < kSizeK; ++kwg) {
+
+      // Loads data: off-chip --> private (matrix A and B)
+      GlobalToPrivateCheckedA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM);
+      GlobalToPrivateCheckedB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN);
+
+      // Performs the accumulation (Cpm += Apm * Bpm)
+      MultiplyAccumulateDirect(cpm, apm, bpm);
+    }
+
+    // Stores a tile of results and performs the multiplication with alpha and beta
+    StoreResultsChecked(cgm, cpm, idm, idn, kSizeM, kSizeN, alpha, beta, c_ld, c_offset, c_transpose);
+  }
+}
+
+// =================================================================================================
+
+// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [transposed, transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate);
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/msvc.hpp b/src/msvc.hpp
new file mode 100644
index 00000000..a45105df
--- /dev/null
+++ b/src/msvc.hpp
@@ -0,0 +1,39 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides macro's and definitions to make compilation work on Microsoft Visual Studio,
+// in particular for versions older than 2015 with limited C++11 support.
+// MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015)
+// MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013)
+// MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012)
+// MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010)
+// MSVC++ 9.0  _MSC_VER == 1500 (Visual Studio 2008)
+//
+// =================================================================================================
+
+#ifndef CLBLAST_MSVC_HPP_
+#define CLBLAST_MSVC_HPP_
+
+namespace clblast {
+// =================================================================================================
+#ifdef _MSC_VER
+
+// No support for constexpr prior to 2015. Note that this only works with constants, not with
+// constexpr functions (unused in this project).
+#if _MSC_VER < 1900
+#define constexpr const
+#endif
+
+// _MSC_VER
+#endif
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_MSVC_HPP_
+#endif
diff --git a/src/routine.cpp b/src/routine.cpp
index 189ae190..80764b74 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -14,6 +14,7 @@
 #include <string>
 #include <vector>
 #include <chrono>
+#include <cstdlib>
 
 #include "routine.hpp"
 
@@ -23,7 +24,7 @@ namespace clblast {
 // Constructor: not much here, because no status codes can be returned
 Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
                  const std::vector<std::string> &routines, const Precision precision,
-                 const std::vector<Database::DatabaseEntry> &userDatabase):
+                 const std::vector<const Database::DatabaseEntry*> &userDatabase):
     precision_(precision),
     routine_name_(name),
     queue_(queue),
@@ -42,13 +43,19 @@ StatusCode Routine::SetUp() {
   // Queries the cache to see whether or not the program (context-specific) is already there
   if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
 
+  // Sets the build options from an environmental variable (if set)
+  auto options = std::vector<std::string>();
+  const auto environment_variable = std::getenv("CLBLAST_BUILD_OPTIONS");
+  if (environment_variable != nullptr) {
+    options.push_back(std::string(environment_variable));
+  }
+
   // Queries the cache to see whether or not the binary (device-specific) is already there. If it
   // is, a program is created and stored in the cache
   if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
     try {
       auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
       auto program = Program(device_, context_, binary);
-      auto options = std::vector<std::string>();
       program.Build(device_, options);
       StoreProgramToCache(program, context_, precision_, routine_name_);
     } catch (...) { return StatusCode::kBuildProgramFailure; }
@@ -115,7 +122,6 @@ StatusCode Routine::SetUp() {
   // Compiles the kernel
   try {
     auto program = Program(context_, source_string);
-    auto options = std::vector<std::string>();
     const auto build_status = program.Build(device_, options);
 
     // Checks for compiler crashes/errors/warnings
diff --git a/src/routine.hpp b/src/routine.hpp
index f5c607af..8582a2b7 100644
--- a/src/routine.hpp
+++ b/src/routine.hpp
@@ -36,7 +36,7 @@ class Routine {
   // built-in database.
   explicit Routine(Queue &queue, EventPointer event, const std::string &name,
                    const std::vector<std::string> &routines, const Precision precision,
-                   const std::vector<Database::DatabaseEntry> &userDatabase = {});
+                   const std::vector<const Database::DatabaseEntry*> &userDatabase = {});
 
   // Set-up phase of the kernel
   StatusCode SetUp();
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 0b8e768f..1602c69f 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -22,7 +22,9 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
+    Routine(queue, event, name,
+            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+            PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level3/level3.opencl"
     #include "../../kernels/level3/copy_fast.opencl"
@@ -32,10 +34,16 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/convert_symmetric.opencl"
     #include "../../kernels/level3/convert_triangular.opencl"
     #include "../../kernels/level3/convert_hermitian.opencl"
+    #include "../../kernels/level3/xgemm_direct_part1.opencl"
+    #include "../../kernels/level3/xgemm_direct_part2.opencl"
+    #include "../../kernels/level3/xgemm_direct_part3.opencl"
+  ;
+  auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
     #include "../../kernels/level3/xgemm_part3.opencl"
   ;
+  source_string_ += source_string_part_2;
 }
 
 // =================================================================================================
@@ -98,6 +106,44 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
   if (ErrorIn(status)) { return status; }
 
+  // Selects which version of GEMM to run
+  const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
+  if (do_gemm_direct) { // for small sizes (single kernel)
+    return GemmDirect(m, n, k, alpha,
+                      a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
+                      c_buffer, c_offset, c_ld,
+                      a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate);
+  }
+  else { // for larger sizes (pre/post-processing plus a very fast kernel)
+    return GemmIndirect(m, n, k, alpha,
+                        a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
+                        c_buffer, c_offset, c_ld,
+                        a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate,
+                        a_one, a_two, a_want_rotated,
+                        b_one, b_two, b_want_rotated,
+                        c_one, c_two, c_want_rotated);
+  }
+}
+
+// =================================================================================================
+
+// The indirect version of GEMM. This uses the faster but non-general kernel. It has specific
+// requirements, but several pre and post-processing kernels take care of those. However, the
+// overhead of these extra kernels might not be ideal for certain devices/arguments.
+template <typename T>
+StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
+                                  const T alpha,
+                                  const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                                  const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                                  const T beta,
+                                  const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                                  const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                                  const bool a_conjugate, const bool b_conjugate,
+                                  const size_t a_one, const size_t a_two, const bool a_want_rotated,
+                                  const size_t b_one, const size_t b_two, const bool b_want_rotated,
+                                  const size_t c_one, const size_t c_two, const bool c_want_rotated) {
+  auto status = StatusCode::kSuccess;
+
   // Calculates the ceiled versions of m, n, and k
   const auto m_ceiled = Ceil(m, db_["MWG"]);
   const auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -217,6 +263,66 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   } catch (...) { return StatusCode::kTempBufferAllocFailure; }
 }
 
+
+// =================================================================================================
+
+// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
+template <typename T>
+StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
+                                const T alpha,
+                                const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                                const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                                const T beta,
+                                const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                                const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                                const bool a_conjugate, const bool b_conjugate) {
+
+  // Loads the program from the database
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+  // Retrieves the proper XgemmDirect kernel from the compiled binary
+  try {
+    const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
+                                         (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
+    auto kernel = Kernel(program, name);
+
+    // Sets the kernel arguments
+    kernel.SetArgument(0, static_cast<int>(m));
+    kernel.SetArgument(1, static_cast<int>(n));
+    kernel.SetArgument(2, static_cast<int>(k));
+    kernel.SetArgument(3, GetRealArg(alpha));
+    kernel.SetArgument(4, GetRealArg(beta));
+    kernel.SetArgument(5, a_buffer());
+    kernel.SetArgument(6, static_cast<int>(a_offset));
+    kernel.SetArgument(7, static_cast<int>(a_ld));
+    kernel.SetArgument(8, b_buffer());
+    kernel.SetArgument(9, static_cast<int>(b_offset));
+    kernel.SetArgument(10, static_cast<int>(b_ld));
+    kernel.SetArgument(11, c_buffer());
+    kernel.SetArgument(12, static_cast<int>(c_offset));
+    kernel.SetArgument(13, static_cast<int>(c_ld));
+    kernel.SetArgument(14, static_cast<int>(c_do_transpose));
+    kernel.SetArgument(15, static_cast<int>(a_conjugate));
+    kernel.SetArgument(16, static_cast<int>(b_conjugate));
+
+    // Computes the global and local thread sizes
+    const auto m_ceiled = Ceil(m, db_["WGD"]);
+    const auto n_ceiled = Ceil(n, db_["WGD"]);
+    const auto global = std::vector<size_t>{
+      (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
+      (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
+    };
+    const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
+
+    // Launches the kernel
+    auto status = RunKernel(kernel, queue_, device_, global, local, event_);
+    if (ErrorIn(status)) { return status; }
+
+    // Successfully finished the computation
+    return StatusCode::kSuccess;
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
 // =================================================================================================
 
 // Compiles the templated class
diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp
index bc51c7f5..46e12453 100644
--- a/src/routines/level3/xgemm.hpp
+++ b/src/routines/level3/xgemm.hpp
@@ -35,6 +35,29 @@ class Xgemm: public Routine {
                     const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                     const T beta,
                     const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+
+  // Indirect version of GEMM (with pre and post-processing kernels)
+  StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k,
+                          const T alpha,
+                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                          const T beta,
+                          const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                          const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                          const bool a_conjugate, const bool b_conjugate,
+                          const size_t a_one, const size_t a_two, const bool a_want_rotated,
+                          const size_t b_one, const size_t b_two, const bool b_want_rotated,
+                          const size_t c_one, const size_t c_two, const bool c_want_rotated);
+
+  // Direct version of GEMM (no pre and post-processing kernels)
+  StatusCode GemmDirect(const size_t m, const size_t n, const size_t k,
+                        const T alpha,
+                        const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                        const T beta,
+                        const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                        const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                        const bool a_conjugate, const bool b_conjugate);
 };
 
 // =================================================================================================
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index 78ded56e..c57aab39 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -47,6 +47,7 @@ class TuneCopy {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 90f5ea82..9486ee8d 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -47,6 +47,7 @@ class TunePad {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 10fa80cb..2d9d5e49 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -47,6 +47,7 @@ class TuneTranspose {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index 507718eb..d364dabe 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -47,6 +47,7 @@ class TunePadTranspose {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 0033b3c6..403ee9e4 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -51,6 +51,7 @@ class TuneXaxpy {
   static size_t DefaultN() { return 4096*1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp
index 1581e13f..f8416761 100644
--- a/src/tuning/kernels/xdot.cpp
+++ b/src/tuning/kernels/xdot.cpp
@@ -47,6 +47,7 @@ class TuneXdot {
   static size_t DefaultN() { return 2*1024*1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 4cb7fd00..0eb1875b 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -52,6 +52,7 @@ class TuneXgemm {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1024; }
   static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
@@ -126,10 +127,10 @@ class TuneXgemm {
   // Sets the local memory size
   static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
     auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision));
+      return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision));
     };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM",
-                                                    "SB", "KWG", "NWG", "VWN"});
+    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG",
+                                                    "SB", "KWG", "NWG"});
   }
 
   // Sets the base thread configuration
diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp
new file mode 100644
index 00000000..204e0be4
--- /dev/null
+++ b/src/tuning/kernels/xgemm_direct.cpp
@@ -0,0 +1,196 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations:
+// - V==1: This tests some limited set of tuning parameters exhaustively.
+// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, int V>
+class TuneXgemmDirect {
+ public:
+
+  // The representative kernel and the source code
+  static std::string KernelFamily() { return (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; }
+  static std::string KernelName() { return "XgemmDirectTN"; }
+  static std::string GetSources() {
+    return
+      #include "../src/kernels/common.opencl"
+      #include "../src/kernels/level3/xgemm_direct_part1.opencl"
+      #include "../src/kernels/level3/xgemm_direct_part2.opencl"
+      #include "../src/kernels/level3/xgemm_direct_part3.opencl"
+    ;
+  }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction};
+  }
+
+  // Tests for valid arguments
+  static void TestValidArguments(const Arguments<T> &) { }
+
+  // Sets the default values for the arguments
+  static size_t DefaultM() { return 256; }
+  static size_t DefaultN() { return 256; }
+  static size_t DefaultK() { return 256; }
+  static double DefaultFraction() { return (V==1) ? 1.0 : 32.0; } // test all or sample randomly
+  static size_t DefaultNumRuns() { return 4; } // run every kernel this many times for averaging
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.k; }
+  static size_t GetSizeB(const Arguments<T> &args) { return args.n * args.k; }
+  static size_t GetSizeC(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
+
+  // Sets the tuning parameters and their possible values
+  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
+    if (V==1) { // limited subset of tuning parameters - but explorable exhaustively
+      tuner.AddParameter(id, "WGD", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMAD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMBD", {8, 16, 32});
+      tuner.AddParameter(id, "KWID", {2});
+      tuner.AddParameter(id, "VWMD", {1, 2, 4, 8});
+      tuner.AddParameter(id, "VWND", {1, 2, 4, 8});
+      tuner.AddParameter(id, "PADA", {1});
+      tuner.AddParameter(id, "PADB", {1});
+    } // a lot more tuning parameters - has to be sampled randomly, too much to test all
+    else {
+      tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128});
+      tuner.AddParameter(id, "MDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMAD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMBD", {8, 16, 32});
+      tuner.AddParameter(id, "KWID", {2, 8, 16});
+      tuner.AddParameter(id, "VWMD", {1, 2, 4, 8});
+      tuner.AddParameter(id, "VWND", {1, 2, 4, 8});
+      tuner.AddParameter(id, "PADA", {0, 1});
+      tuner.AddParameter(id, "PADB", {0, 1});
+    }
+  }
+
+  // Sets the constraints
+  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+    auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
+    auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
+    auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
+    // Requirement for unrolling the WGD loop
+    tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"});
+    // Required for integer MWID and NWID
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"});
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"});
+    // Required for integer MWIAD and NWIBD
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"});
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"});
+    // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...)
+    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"});
+    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"});
+
+    // Extra constraints for variation 1 to limit the set of options significantly
+    if (V==1) {
+      auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
+      tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"});
+      tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"});
+    }
+  }
+
+  // Sets the local memory size
+  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
+    auto LocalMemorySize = [args] (std::vector<size_t> v) {
+      return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision));
+    };
+    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"});
+  }
+
+  // Sets the base thread configuration
+  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
+  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
+  static std::vector<size_t> LocalSize() { return {1, 1}; }
+  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
+
+  // Transforms the thread configuration based on the parameters
+  using TransformVector = std::vector<std::vector<std::string>>;
+  static TransformVector MulLocal() { return {{"MDIMCD", "NDIMCD"}}; }
+  static TransformVector DivLocal() { return {}; }
+  static TransformVector MulGlobal() { return {{"MDIMCD", "NDIMCD"}}; }
+  static TransformVector DivGlobal() { return {{"WGD", "WGD"}}; }
+
+  // Sets the kernel's arguments
+  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
+                           std::vector<T> &, std::vector<T> &,
+                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
+                           std::vector<T> &) {
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(static_cast<int>(args.n));
+    tuner.AddArgumentScalar(static_cast<int>(args.k));
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+    tuner.AddArgumentScalar(GetRealArg(args.beta));
+    tuner.AddArgumentInput(a_mat);
+    tuner.AddArgumentScalar(0); // a_offset
+    tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld
+    tuner.AddArgumentInput(b_mat);
+    tuner.AddArgumentScalar(0); // b_offset
+    tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld
+    tuner.AddArgumentOutput(c_mat);
+    tuner.AddArgumentScalar(0); // c_offset
+    tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld
+    tuner.AddArgumentScalar(1); // c_do_transpose
+    tuner.AddArgumentScalar(0); // a_conjugate
+    tuner.AddArgumentScalar(0); // b_conjugate
+  }
+
+  // Describes how to compute the performance metrics
+  static size_t GetMetric(const Arguments<T> &args) {
+    return 2 * args.m * args.n * args.k;
+  }
+  static std::string PerformanceUnit() { return "GFLOPS"; }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Function to tune a specific variation V (not within the clblast namespace)
+template <int V>
+void StartVariation(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemmDirect<half,V>, half>(argc, argv); break;
+    case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemmDirect<float,V>, float>(argc, argv); break;
+    case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemmDirect<double,V>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemmDirect<float2,V>, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemmDirect<double2,V>, double2>(argc, argv); break;
+  }
+}
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  StartVariation<1>(argc, argv);
+  StartVariation<2>(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index 7229602d..f332f52a 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -50,6 +50,7 @@ class TuneXgemv {
   static size_t DefaultN() { return 2048; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index 1fb5c531..c3d0c7dd 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -47,6 +47,7 @@ class TuneXger {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.m; }
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 19df5f9a..afb092bc 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -30,6 +30,7 @@ namespace clblast {
 // that it is automatically compiled for the various kernels (given as the 'C' template argument).
 template <typename C, typename T>
 void Tuner(int argc, char* argv[]) {
+  constexpr auto kSeed = 42; // fixed seed for reproducibility
 
   // Sets the parameters and platform/device for which to tune (command-line options)
   auto help = std::string{"* Options given/available:\n"};
@@ -45,6 +46,8 @@ void Tuner(int argc, char* argv[]) {
     if (o == kArgBeta)     { args.beta     = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
     if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
   }
+  const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, C::DefaultNumRuns());
+
   fprintf(stdout, "%s\n", help.c_str());
 
   // Tests validity of the given arguments
@@ -73,12 +76,12 @@ void Tuner(int argc, char* argv[]) {
   auto b_mat = std::vector<T>(C::GetSizeB(args));
   auto c_mat = std::vector<T>(C::GetSizeC(args));
   auto temp = std::vector<T>(C::GetSizeTemp(args));
-  PopulateVector(x_vec);
-  PopulateVector(y_vec);
-  PopulateVector(a_mat);
-  PopulateVector(b_mat);
-  PopulateVector(c_mat);
-  PopulateVector(temp);
+  PopulateVector(x_vec, kSeed);
+  PopulateVector(y_vec, kSeed);
+  PopulateVector(a_mat, kSeed);
+  PopulateVector(b_mat, kSeed);
+  PopulateVector(c_mat, kSeed);
+  PopulateVector(temp, kSeed);
 
   // Initializes the tuner for the chosen device
   cltune::Tuner tuner(args.platform_id, args.device_id);
@@ -126,6 +129,7 @@ void Tuner(int argc, char* argv[]) {
   C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
 
   // Starts the tuning process
+  tuner.SetNumRuns(num_runs);
   tuner.Tune();
 
   // Prints the results to screen
@@ -134,7 +138,7 @@ void Tuner(int argc, char* argv[]) {
 
   // Also prints the performance of the best-case in terms of GB/s or GFLOPS
   if (time_ms != 0.0) {
-    printf("[ -------> ] %.1lf ms", time_ms);
+    printf("[ -------> ] %.2lf ms", time_ms);
     printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
   }
 
diff --git a/src/utilities.cpp b/src/utilities.cpp
index 77bc72d7..86cc2d13 100644
--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -270,40 +270,40 @@ unsigned int GetRandomSeed() {
 
 // Create a random number generator and populates a vector with samples from a random distribution
 template <typename T>
-void PopulateVector(std::vector<T> &vector) {
+void PopulateVector(std::vector<T> &vector, const unsigned int seed) {
   auto lower_limit = static_cast<T>(kTestDataLowerLimit);
   auto upper_limit = static_cast<T>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<T> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element = dist(mt); }
 }
-template void PopulateVector<float>(std::vector<float>&);
-template void PopulateVector<double>(std::vector<double>&);
+template void PopulateVector<float>(std::vector<float>&, const unsigned int);
+template void PopulateVector<double>(std::vector<double>&, const unsigned int);
 
 // Specialized versions of the above for complex data-types
 template <>
-void PopulateVector(std::vector<float2> &vector) {
+void PopulateVector(std::vector<float2> &vector, const unsigned int seed) {
   auto lower_limit = static_cast<float>(kTestDataLowerLimit);
   auto upper_limit = static_cast<float>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
 }
 template <>
-void PopulateVector(std::vector<double2> &vector) {
+void PopulateVector(std::vector<double2> &vector, const unsigned int seed) {
   auto lower_limit = static_cast<double>(kTestDataLowerLimit);
   auto upper_limit = static_cast<double>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<double> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
 }
 
 // Specialized versions of the above for half-precision
 template <>
-void PopulateVector(std::vector<half> &vector) {
+void PopulateVector(std::vector<half> &vector, const unsigned int seed) {
   const auto lower_limit = static_cast<float>(kTestDataLowerLimit);
   const auto upper_limit = static_cast<float>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element = FloatToHalf(dist(mt)); }
 }
diff --git a/src/utilities.hpp b/src/utilities.hpp
index 75bd5a69..038a8a96 100644
--- a/src/utilities.hpp
+++ b/src/utilities.hpp
@@ -25,6 +25,8 @@
 #include "clblast_half.h"
 #include "clpp11.hpp"
 
+#include "msvc.hpp"
+
 namespace clblast {
 // =================================================================================================
 
@@ -206,7 +208,7 @@ bool CheckArgument(const int argc, char *argv[], std::string &help, const std::s
 // =================================================================================================
 
 // Helper function to check for errors in the status code
-constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
+inline bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
 
 // =================================================================================================
 
@@ -219,7 +221,7 @@ constexpr auto kTestDataUpperLimit = 2.0;
 
 // Populates a vector with random data
 template <typename T>
-void PopulateVector(std::vector<T> &vector);
+void PopulateVector(std::vector<T> &vector, const unsigned int seed);
 
 // =================================================================================================
 
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 2e751255..ce3b0e07 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -19,7 +19,24 @@
 namespace clblast {
 // =================================================================================================
 
-// The transpose-options to test with (data-type dependent)
+// Test settings for the regular test. Append to these lists in case more tests are required.
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVectorDims = { 7, 93, 4096 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kIncrements = { 1, 2, 7 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixDims = { 7, 64 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixVectorDims = { 61, 512 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kBandSizes = { 4, 19 };
+
+// Test settings for the invalid tests
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kInvalidIncrements = { 0, 1 };
+template <typename T, typename U> const size_t TestBlas<T,U>::kBufferSize = 64;
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVecSizes = {0, kBufferSize - 1, kBufferSize};
+
+// The layout/transpose/triangle options to test with
+template <typename T, typename U> const std::vector<Layout> TestBlas<T,U>::kLayouts = {Layout::kRowMajor, Layout::kColMajor};
+template <typename T, typename U> const std::vector<Triangle> TestBlas<T,U>::kTriangles = {Triangle::kUpper, Triangle::kLower};
+template <typename T, typename U> const std::vector<Side> TestBlas<T,U>::kSides = {Side::kLeft, Side::kRight};
+template <typename T, typename U> const std::vector<Diagonal> TestBlas<T,U>::kDiagonals = {Diagonal::kUnit, Diagonal::kNonUnit};
 template <> const std::vector<Transpose> TestBlas<half,half>::kTransposes = {Transpose::kNo, Transpose::kYes};
 template <> const std::vector<Transpose> TestBlas<float,float>::kTransposes = {Transpose::kNo, Transpose::kYes};
 template <> const std::vector<Transpose> TestBlas<double,double>::kTransposes = {Transpose::kNo, Transpose::kYes};
@@ -39,6 +56,9 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
                         const ResultGet get_result, const ResultIndex get_index,
                         const ResultIterator get_id1, const ResultIterator get_id2):
     Tester<T,U>(argc, argv, silent, name, options),
+    kOffsets(GetOffsets()),
+    kAlphaValues(GetExampleScalars<U>(full_test_)),
+    kBetaValues(GetExampleScalars<U>(full_test_)),
     run_routine_(run_routine),
     get_result_(get_result),
     get_index_(get_index),
@@ -66,13 +86,13 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
   c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
   ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
   scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset);
-  PopulateVector(x_source_);
-  PopulateVector(y_source_);
-  PopulateVector(a_source_);
-  PopulateVector(b_source_);
-  PopulateVector(c_source_);
-  PopulateVector(ap_source_);
-  PopulateVector(scalar_source_);
+  PopulateVector(x_source_, kSeed);
+  PopulateVector(y_source_, kSeed);
+  PopulateVector(a_source_, kSeed);
+  PopulateVector(b_source_, kSeed);
+  PopulateVector(c_source_, kSeed);
+  PopulateVector(ap_source_, kSeed);
+  PopulateVector(scalar_source_, kSeed);
 }
 
 // ===============================================================================================
diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp
index d01cd06c..da572e01 100644
--- a/test/correctness/testblas.hpp
+++ b/test/correctness/testblas.hpp
@@ -30,6 +30,7 @@ namespace clblast {
 template <typename T, typename U>
 class TestBlas: public Tester<T,U> {
  public:
+  static constexpr auto kSeed = 42; // fixed seed for reproducibility
 
   // Uses several variables from the Tester class
   using Tester<T,U>::context_;
@@ -50,26 +51,26 @@ class TestBlas: public Tester<T,U> {
   using Tester<T,U>::GetSizesString;
 
   // Test settings for the regular test. Append to these lists in case more tests are required.
-  const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
-  const std::vector<size_t> kIncrements = { 1, 2, 7 };
-  const std::vector<size_t> kMatrixDims = { 7, 64 };
-  const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
-  const std::vector<size_t> kBandSizes = { 4, 19 };
-  const std::vector<size_t> kOffsets = GetOffsets();
-  const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
-  const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
+  static const std::vector<size_t> kVectorDims;
+  static const std::vector<size_t> kIncrements;
+  static const std::vector<size_t> kMatrixDims;
+  static const std::vector<size_t> kMatrixVectorDims;
+  static const std::vector<size_t> kBandSizes;
+  const std::vector<size_t> kOffsets;
+  const std::vector<U> kAlphaValues;
+  const std::vector<U> kBetaValues;
 
   // Test settings for the invalid tests
-  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
-  const size_t kBufferSize = 64;
-  const std::vector<size_t> kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
-  const std::vector<size_t> kVecSizes = {0, kBufferSize - 1, kBufferSize};
+  static const std::vector<size_t> kInvalidIncrements;
+  static const size_t kBufferSize;
+  static const std::vector<size_t> kMatSizes;
+  static const std::vector<size_t> kVecSizes;
 
   // The layout/transpose/triangle options to test with
-  const std::vector<Layout> kLayouts = {Layout::kRowMajor, Layout::kColMajor};
-  const std::vector<Triangle> kTriangles = {Triangle::kUpper, Triangle::kLower};
-  const std::vector<Side> kSides = {Side::kLeft, Side::kRight};
-  const std::vector<Diagonal> kDiagonals = {Diagonal::kUnit, Diagonal::kNonUnit};
+  static const std::vector<Layout> kLayouts;
+  static const std::vector<Triangle> kTriangles;
+  static const std::vector<Side> kSides;
+  static const std::vector<Diagonal> kDiagonals;
   static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
 
   // Shorthand for the routine-specific functions passed to the tester
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index 362c5c2c..d25d704c 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -22,6 +22,30 @@
 namespace clblast {
 // =================================================================================================
 
+// Maximum number of test results printed on a single line
+template <typename T, typename U> const size_t Tester<T,U>::kResultsPerLine = size_t{64};
+
+// Error percentage is not applicable: error was caused by an incorrect status
+template <typename T, typename U> const float Tester<T,U>::kStatusError = -1.0f;
+
+// Constants holding start and end strings for terminal-output in colour
+template <typename T, typename U> const std::string Tester<T,U>::kPrintError = "\x1b[31m";
+template <typename T, typename U> const std::string Tester<T,U>::kPrintSuccess = "\x1b[32m";
+template <typename T, typename U> const std::string Tester<T,U>::kPrintWarning = "\x1b[35m";
+template <typename T, typename U> const std::string Tester<T,U>::kPrintMessage = "\x1b[1m";
+template <typename T, typename U> const std::string Tester<T,U>::kPrintEnd = "\x1b[0m";
+
+// Sets the output error coding
+template <typename T, typename U> const std::string Tester<T,U>::kSuccessData = kPrintSuccess + ":" + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kSuccessStatus = kPrintSuccess + "." + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kErrorData = kPrintError + "X" + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kErrorStatus = kPrintError + "/" + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kSkippedCompilation = kPrintWarning + "\\" + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kUnsupportedPrecision = kPrintWarning + "o" + kPrintEnd;
+template <typename T, typename U> const std::string Tester<T,U>::kUnsupportedReference = kPrintWarning + "-" + kPrintEnd;
+
+// =================================================================================================
+
 // General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
 // the clBLAS library for reference.
 template <typename T, typename U>
@@ -41,8 +65,8 @@ Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
     print_count_{0},
     tests_passed_{0},
     tests_skipped_{0},
-    tests_failed_{0},
-    options_{options} {
+    tests_failed_{0} {
+  options_ = options;
 
   // Determines which reference to test against
   #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS)
@@ -358,28 +382,33 @@ void Tester<T,U>::PrintErrorLog(const std::vector<ErrorLogEntry> &error_log) {
 // Compares two floating point values and returns whether they are within an acceptable error
 // margin. This replaces GTest's EXPECT_NEAR().
 template <typename T>
-bool TestSimilarity(const T val1, const T val2) {
+bool TestSimilarityNear(const T val1, const T val2,
+                        const T error_margin_absolute, const T error_margin_relative) {
   const auto difference = std::fabs(val1 - val2);
 
-  // Set the allowed error margin for floating-point comparisons
-  constexpr auto kErrorMarginRelative = T(0.025);
-  constexpr auto kErrorMarginAbsolute = T(1.0e-3);
-
   // Shortcut, handles infinities
   if (val1 == val2) {
     return true;
   }
   // The values are zero or very small: the relative error is less meaningful
-  else if (val1 == 0 || val2 == 0 || difference < kErrorMarginAbsolute) {
-    return (difference < kErrorMarginAbsolute);
+  else if (val1 == 0 || val2 == 0 || difference < error_margin_absolute) {
+    return (difference < error_margin_absolute);
   }
   // Use relative error
   else {
     const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
-    return (difference / absolute_sum) < kErrorMarginRelative;
+    return (difference / absolute_sum) < error_margin_relative;
   }
 }
 
+// Default method for similarity testing
+template <typename T>
+bool TestSimilarity(const T val1, const T val2) {
+  constexpr auto kErrorMarginRelative = T(0.025);
+  constexpr auto kErrorMarginAbsolute = T(0.001);
+  return TestSimilarityNear(val1, val2, kErrorMarginRelative, kErrorMarginAbsolute);
+}
+
 // Compiles the default case for standard data-types
 template bool TestSimilarity<float>(const float, const float);
 template bool TestSimilarity<double>(const double, const double);
@@ -399,7 +428,10 @@ bool TestSimilarity(const double2 val1, const double2 val2) {
 }
 template <>
 bool TestSimilarity(const half val1, const half val2) {
-  return TestSimilarity(HalfToFloat(val1), HalfToFloat(val2));
+  constexpr auto kErrorMarginRelative = float(0.050);
+  constexpr auto kErrorMarginAbsolute = float(0.002);
+  return TestSimilarityNear(HalfToFloat(val1), HalfToFloat(val2),
+                            kErrorMarginRelative, kErrorMarginAbsolute);
 }
 
 // =================================================================================================
diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp
index 422da9ed..13c28e3d 100644
--- a/test/correctness/tester.hpp
+++ b/test/correctness/tester.hpp
@@ -39,26 +39,26 @@ class Tester {
  public:
 
   // Maximum number of test results printed on a single line
-  static constexpr auto kResultsPerLine = size_t{64};
+  static const size_t kResultsPerLine;
 
   // Error percentage is not applicable: error was caused by an incorrect status
-  static constexpr auto kStatusError = -1.0f;
+  static const float kStatusError;
 
   // Constants holding start and end strings for terminal-output in colour
-  const std::string kPrintError{"\x1b[31m"};
-  const std::string kPrintSuccess{"\x1b[32m"};
-  const std::string kPrintWarning{"\x1b[35m"};
-  const std::string kPrintMessage{"\x1b[1m"};
-  const std::string kPrintEnd{"\x1b[0m"};
+  static const std::string kPrintError;
+  static const std::string kPrintSuccess;
+  static const std::string kPrintWarning;
+  static const std::string kPrintMessage;
+  static const std::string kPrintEnd;
 
   // Sets the output error coding
-  const std::string kSuccessData{kPrintSuccess + ":" + kPrintEnd};
-  const std::string kSuccessStatus{kPrintSuccess + "." + kPrintEnd};
-  const std::string kErrorData{kPrintError + "X" + kPrintEnd};
-  const std::string kErrorStatus{kPrintError + "/" + kPrintEnd};
-  const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
-  const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
-  const std::string kUnsupportedReference{kPrintWarning + "-" + kPrintEnd};
+  static const std::string kSuccessData;
+  static const std::string kSuccessStatus;
+  static const std::string kErrorData;
+  static const std::string kErrorStatus;
+  static const std::string kSkippedCompilation;
+  static const std::string kUnsupportedPrecision;
+  static const std::string kUnsupportedReference;
 
   // This structure combines the above log-entry with a status code an error percentage
   struct ErrorLogEntry {
diff --git a/test/performance/client.cpp b/test/performance/client.cpp
index aaaab22e..cbb10d10 100644
--- a/test/performance/client.cpp
+++ b/test/performance/client.cpp
@@ -178,13 +178,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
     std::vector<T> c_source(args.c_size);
     std::vector<T> ap_source(args.ap_size);
     std::vector<T> scalar_source(args.scalar_size);
-    PopulateVector(x_source);
-    PopulateVector(y_source);
-    PopulateVector(a_source);
-    PopulateVector(b_source);
-    PopulateVector(c_source);
-    PopulateVector(ap_source);
-    PopulateVector(scalar_source);
+    PopulateVector(x_source, kSeed);
+    PopulateVector(y_source, kSeed);
+    PopulateVector(a_source, kSeed);
+    PopulateVector(b_source, kSeed);
+    PopulateVector(c_source, kSeed);
+    PopulateVector(ap_source, kSeed);
+    PopulateVector(scalar_source, kSeed);
 
     // Creates the matrices on the device
     auto x_vec = Buffer<T>(context, args.x_size);
diff --git a/test/performance/client.hpp b/test/performance/client.hpp
index 6d35fced..381ba158 100644
--- a/test/performance/client.hpp
+++ b/test/performance/client.hpp
@@ -40,6 +40,7 @@ namespace clblast {
 template <typename T, typename U>
 class Client {
  public:
+  static constexpr auto kSeed = 42; // fixed seed for reproducibility
 
   // Shorthand for the routine-specific functions passed to the tester
   using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp
index 4423845e..f98bdb06 100644
--- a/test/routines/level1/xamax.hpp
+++ b/test/routines/level1/xamax.hpp
@@ -76,7 +76,7 @@ class TestXamax {
                           buffers.scalar(), args.imax_offset,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp
index b1f02dcd..64aa37c2 100644
--- a/test/routines/level1/xasum.hpp
+++ b/test/routines/level1/xasum.hpp
@@ -76,7 +76,7 @@ class TestXasum {
                           buffers.scalar(), args.asum_offset,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp
index c276a42e..b24e6fe8 100644
--- a/test/routines/level1/xaxpy.hpp
+++ b/test/routines/level1/xaxpy.hpp
@@ -77,7 +77,7 @@ class TestXaxpy {
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp
index a96bb9ae..87bc21d4 100644
--- a/test/routines/level1/xcopy.hpp
+++ b/test/routines/level1/xcopy.hpp
@@ -76,7 +76,7 @@ class TestXcopy {
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp
index f6cf2809..c4f6076a 100644
--- a/test/routines/level1/xdot.hpp
+++ b/test/routines/level1/xdot.hpp
@@ -81,7 +81,7 @@ class TestXdot {
                          buffers.x_vec(), args.x_offset, args.x_inc,
                          buffers.y_vec(), args.y_offset, args.y_inc,
                          &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp
index 2b00d04b..aae892a8 100644
--- a/test/routines/level1/xdotc.hpp
+++ b/test/routines/level1/xdotc.hpp
@@ -81,7 +81,7 @@ class TestXdotc {
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp
index 31a867e0..f6be385b 100644
--- a/test/routines/level1/xdotu.hpp
+++ b/test/routines/level1/xdotu.hpp
@@ -81,7 +81,7 @@ class TestXdotu {
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp
index 62d649e3..e604077c 100644
--- a/test/routines/level1/xnrm2.hpp
+++ b/test/routines/level1/xnrm2.hpp
@@ -76,7 +76,7 @@ class TestXnrm2 {
                           buffers.scalar(), args.nrm2_offset,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp
index 79926890..3c438bd6 100644
--- a/test/routines/level1/xscal.hpp
+++ b/test/routines/level1/xscal.hpp
@@ -72,7 +72,7 @@ class TestXscal {
     auto status = Scal(args.n, args.alpha,
                        buffers.x_vec(), args.x_offset, args.x_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp
index 8f7e4cfe..a0491f12 100644
--- a/test/routines/level1/xswap.hpp
+++ b/test/routines/level1/xswap.hpp
@@ -76,7 +76,7 @@ class TestXswap {
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp
index 5a907077..5ed92aae 100644
--- a/test/routines/level2/xgbmv.hpp
+++ b/test/routines/level2/xgbmv.hpp
@@ -90,7 +90,7 @@ class TestXgbmv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp
index 1499b2d2..9ee6d535 100644
--- a/test/routines/level2/xgemv.hpp
+++ b/test/routines/level2/xgemv.hpp
@@ -90,7 +90,7 @@ class TestXgemv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp
index 5cbed505..42283107 100644
--- a/test/routines/level2/xger.hpp
+++ b/test/routines/level2/xger.hpp
@@ -86,7 +86,7 @@ class TestXger {
                       buffers.y_vec(), args.y_offset, args.y_inc,
                       buffers.a_mat(), args.a_offset, args.a_ld,
                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp
index d50092cb..ef69c197 100644
--- a/test/routines/level2/xgerc.hpp
+++ b/test/routines/level2/xgerc.hpp
@@ -86,7 +86,7 @@ class TestXgerc {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp
index 9c823b73..b2afc6d8 100644
--- a/test/routines/level2/xgeru.hpp
+++ b/test/routines/level2/xgeru.hpp
@@ -86,7 +86,7 @@ class TestXgeru {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp
index 01cb3f51..8bda4d0c 100644
--- a/test/routines/level2/xhbmv.hpp
+++ b/test/routines/level2/xhbmv.hpp
@@ -84,7 +84,7 @@ class TestXhbmv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp
index dadd3975..80565d04 100644
--- a/test/routines/level2/xhemv.hpp
+++ b/test/routines/level2/xhemv.hpp
@@ -84,7 +84,7 @@ class TestXhemv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp
index b21c0a9b..d71c8009 100644
--- a/test/routines/level2/xher.hpp
+++ b/test/routines/level2/xher.hpp
@@ -79,7 +79,7 @@ class TestXher {
                       buffers.x_vec(), args.x_offset, args.x_inc,
                       buffers.a_mat(), args.a_offset, args.a_ld,
                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp
index 070f823c..083dfa2f 100644
--- a/test/routines/level2/xher2.hpp
+++ b/test/routines/level2/xher2.hpp
@@ -84,7 +84,7 @@ class TestXher2 {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp
index d7f9634e..1dd63562 100644
--- a/test/routines/level2/xhpmv.hpp
+++ b/test/routines/level2/xhpmv.hpp
@@ -84,7 +84,7 @@ class TestXhpmv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp
index 8f44a68d..a5c77811 100644
--- a/test/routines/level2/xhpr.hpp
+++ b/test/routines/level2/xhpr.hpp
@@ -79,7 +79,7 @@ class TestXhpr {
                       buffers.x_vec(), args.x_offset, args.x_inc,
                       buffers.ap_mat(), args.ap_offset,
                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp
index 666a8dfc..d09178f0 100644
--- a/test/routines/level2/xhpr2.hpp
+++ b/test/routines/level2/xhpr2.hpp
@@ -84,7 +84,7 @@ class TestXhpr2 {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.ap_mat(), args.ap_offset,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp
index fd5dd68e..8e0f8321 100644
--- a/test/routines/level2/xsbmv.hpp
+++ b/test/routines/level2/xsbmv.hpp
@@ -84,7 +84,7 @@ class TestXsbmv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp
index 63286248..977f733a 100644
--- a/test/routines/level2/xspmv.hpp
+++ b/test/routines/level2/xspmv.hpp
@@ -84,7 +84,7 @@ class TestXspmv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp
index f9dead53..93da4b73 100644
--- a/test/routines/level2/xspr.hpp
+++ b/test/routines/level2/xspr.hpp
@@ -79,7 +79,7 @@ class TestXspr {
                       buffers.x_vec(), args.x_offset, args.x_inc,
                       buffers.ap_mat(), args.ap_offset,
                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp
index a2f22098..b835f2b0 100644
--- a/test/routines/level2/xspr2.hpp
+++ b/test/routines/level2/xspr2.hpp
@@ -84,7 +84,7 @@ class TestXspr2 {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.ap_mat(), args.ap_offset,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp
index 0d3ca632..0ec96f1d 100644
--- a/test/routines/level2/xsymv.hpp
+++ b/test/routines/level2/xsymv.hpp
@@ -84,7 +84,7 @@ class TestXsymv {
                        buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp
index 15ad9595..b49132e3 100644
--- a/test/routines/level2/xsyr.hpp
+++ b/test/routines/level2/xsyr.hpp
@@ -79,7 +79,7 @@ class TestXsyr {
                       buffers.x_vec(), args.x_offset, args.x_inc,
                       buffers.a_mat(), args.a_offset, args.a_ld,
                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp
index a9a61a1f..7c65daa2 100644
--- a/test/routines/level2/xsyr2.hpp
+++ b/test/routines/level2/xsyr2.hpp
@@ -84,7 +84,7 @@ class TestXsyr2 {
                        buffers.y_vec(), args.y_offset, args.y_inc,
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp
index 54e7fe18..cf30c2f7 100644
--- a/test/routines/level2/xtbmv.hpp
+++ b/test/routines/level2/xtbmv.hpp
@@ -78,7 +78,7 @@ class TestXtbmv {
                           buffers.a_mat(), args.a_offset, args.a_ld,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp
index 9776c4de..d08e132f 100644
--- a/test/routines/level2/xtpmv.hpp
+++ b/test/routines/level2/xtpmv.hpp
@@ -78,7 +78,7 @@ class TestXtpmv {
                           buffers.ap_mat(), args.ap_offset,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp
index 18300e50..cf9a0063 100644
--- a/test/routines/level2/xtrmv.hpp
+++ b/test/routines/level2/xtrmv.hpp
@@ -78,7 +78,7 @@ class TestXtrmv {
                           buffers.a_mat(), args.a_offset, args.a_ld,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp
index 5f9bea81..bca3c049 100644
--- a/test/routines/level3/xgemm.hpp
+++ b/test/routines/level3/xgemm.hpp
@@ -92,7 +92,7 @@ class TestXgemm {
                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                        buffers.c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp
index 8c44be25..31c7695f 100644
--- a/test/routines/level3/xhemm.hpp
+++ b/test/routines/level3/xhemm.hpp
@@ -92,7 +92,7 @@ class TestXhemm {
                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                        buffers.c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp
index fd20bbb5..ff2bb6cb 100644
--- a/test/routines/level3/xher2k.hpp
+++ b/test/routines/level3/xher2k.hpp
@@ -91,7 +91,7 @@ class TestXher2k {
                         buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                         buffers.c_mat(), args.c_offset, args.c_ld,
                         &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp
index 12990d39..26396fa9 100644
--- a/test/routines/level3/xherk.hpp
+++ b/test/routines/level3/xherk.hpp
@@ -82,7 +82,7 @@ class TestXherk {
                        buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
                        buffers.c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp
index f8e90927..c84c22b4 100644
--- a/test/routines/level3/xsymm.hpp
+++ b/test/routines/level3/xsymm.hpp
@@ -92,7 +92,7 @@ class TestXsymm {
                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                        buffers.c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp
index 4e4ba0b7..5c4976e2 100644
--- a/test/routines/level3/xsyr2k.hpp
+++ b/test/routines/level3/xsyr2k.hpp
@@ -90,7 +90,7 @@ class TestXsyr2k {
                         buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                         buffers.c_mat(), args.c_offset, args.c_ld,
                         &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp
index f5509c88..98c4f6a4 100644
--- a/test/routines/level3/xsyrk.hpp
+++ b/test/routines/level3/xsyrk.hpp
@@ -82,7 +82,7 @@ class TestXsyrk {
                        buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
                        buffers.c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp
index 45e17e45..55b51e54 100644
--- a/test/routines/level3/xtrmm.hpp
+++ b/test/routines/level3/xtrmm.hpp
@@ -82,7 +82,7 @@ class TestXtrmm {
                        buffers.a_mat(), args.a_offset, args.a_ld,
                        buffers.b_mat(), args.b_offset, args.b_ld,
                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }
 
diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp
index 4637c07e..dccb3583 100644
--- a/test/routines/levelx/xomatcopy.hpp
+++ b/test/routines/levelx/xomatcopy.hpp
@@ -77,7 +77,7 @@ class TestXomatcopy {
                               buffers.a_mat(), args.a_offset, args.a_ld,
                               buffers.b_mat(), args.b_offset, args.b_ld,
                               &queue_plain, &event);
-    clWaitForEvents(1, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
   }