-rw-r--r--  .gitignore                                   |    3
-rw-r--r--  CHANGELOG                                    |    7
-rw-r--r--  CMakeLists.txt                               |  205
-rw-r--r--  LICENSE                                      |   14
-rw-r--r--  README.md                                    |   12
-rw-r--r--  cmake/Modules/FindCLTune.cmake               |   68
-rw-r--r--  cmake/Modules/FindOpenCL.cmake               |   75
-rw-r--r--  doc/performance/GeForce_GTX480/SAXPY.pdf     |  bin 0 -> 13151 bytes
-rw-r--r--  doc/performance/GeForce_GTX480/SGEMM.pdf     |  bin 0 -> 13200 bytes
-rw-r--r--  doc/performance/GeForce_GTX480/SSYMM.pdf     |  bin 0 -> 13194 bytes
-rw-r--r--  doc/performance/Iris/SAXPY.pdf               |  bin 0 -> 13054 bytes
-rw-r--r--  doc/performance/Iris/SGEMM.pdf               |  bin 0 -> 13059 bytes
-rw-r--r--  doc/performance/Iris/SSYMM.pdf               |  bin 0 -> 12987 bytes
-rw-r--r--  doc/performance/Tesla_K40m/SAXPY.pdf         |  bin 0 -> 13399 bytes
-rw-r--r--  doc/performance/Tesla_K40m/SGEMM.pdf         |  bin 0 -> 13249 bytes
-rw-r--r--  doc/performance/Tesla_K40m/SSYMM.pdf         |  bin 0 -> 13228 bytes
-rw-r--r--  include/clblast.h                            |  125
-rw-r--r--  include/internal/clpp11.h                    |  524
-rw-r--r--  include/internal/database.h                  |   90
-rw-r--r--  include/internal/database/copy.h             |  130
-rw-r--r--  include/internal/database/pad.h              |  130
-rw-r--r--  include/internal/database/padtranspose.h     |  130
-rw-r--r--  include/internal/database/transpose.h        |  130
-rw-r--r--  include/internal/database/xaxpy.h            |  129
-rw-r--r--  include/internal/database/xgemm.h            |  133
-rw-r--r--  include/internal/routine.h                   |  132
-rw-r--r--  include/internal/routines/xaxpy.h            |   42
-rw-r--r--  include/internal/routines/xgemm.h            |   46
-rw-r--r--  include/internal/routines/xsymm.h            |   60
-rw-r--r--  include/internal/tuning.h                    |   53
-rw-r--r--  include/internal/utilities.h                 |  174
-rw-r--r--  samples/sgemm.cc                             |  108
-rw-r--r--  src/clblast.cc                               |  224
-rw-r--r--  src/database.cc                              |  112
-rw-r--r--  src/kernels/common.opencl                    |  120
-rw-r--r--  src/kernels/copy.opencl                      |   73
-rw-r--r--  src/kernels/pad.opencl                       |  180
-rw-r--r--  src/kernels/padtranspose.opencl              |  150
-rw-r--r--  src/kernels/transpose.opencl                 |  168
-rw-r--r--  src/kernels/xaxpy.opencl                     |  128
-rw-r--r--  src/kernels/xgemm.opencl                     |  570
-rw-r--r--  src/routine.cc                               |  326
-rw-r--r--  src/routines/xaxpy.cc                        |  115
-rw-r--r--  src/routines/xgemm.cc                        |  168
-rw-r--r--  src/routines/xsymm.cc                        |  132
-rw-r--r--  src/tuning/copy.cc                           |   83
-rw-r--r--  src/tuning/pad.cc                            |   90
-rw-r--r--  src/tuning/padtranspose.cc                   |   95
-rw-r--r--  src/tuning/transpose.cc                      |   88
-rw-r--r--  src/tuning/tuning.cc                         |  186
-rw-r--r--  src/tuning/xaxpy.cc                          |   88
-rw-r--r--  src/tuning/xgemm.cc                          |  126
-rw-r--r--  src/utilities.cc                             |  255
-rw-r--r--  test/correctness/routines/xaxpy.cc           |   81
-rw-r--r--  test/correctness/routines/xgemm.cc           |  104
-rw-r--r--  test/correctness/routines/xsymm.cc           |  104
-rw-r--r--  test/correctness/testabc.cc                  |  212
-rw-r--r--  test/correctness/testabc.h                   |   94
-rw-r--r--  test/correctness/tester.cc                   |  307
-rw-r--r--  test/correctness/tester.h                    |  132
-rw-r--r--  test/correctness/testxy.cc                   |  172
-rw-r--r--  test/correctness/testxy.h                    |   83
-rw-r--r--  test/performance/client.cc                   |  295
-rw-r--r--  test/performance/client.h                    |   85
-rw-r--r--  test/performance/graphs/common.r             |  189
-rw-r--r--  test/performance/graphs/xaxpy.r              |   96
-rwxr-xr-x  test/performance/graphs/xgemm.r              |   94
-rw-r--r--  test/performance/graphs/xsymm.r              |   94
-rw-r--r--  test/performance/routines/xaxpy.cc           |   97
-rw-r--r--  test/performance/routines/xgemm.cc           |  115
-rw-r--r--  test/performance/routines/xsymm.cc           |  115
-rw-r--r--  test/wrapper_clblas.h                        |  216
72 files changed, 8676 insertions(+), 6 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..604b0a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+build
+stash
+.*
\ No newline at end of file
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 00000000..817d3d80
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,7 @@
+
+Version 0.1.0
+- Initial preview version release to GitHub
+- Supported level-1 routines:
+ SAXPY/DAXPY/CAXPY/ZAXPY
+- Supported level-3 routines:
+ SGEMM/DGEMM, SSYMM/DSYMM
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000..6cdb3e46
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,205 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+# width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+#
+# ==================================================================================================
+
+# CMake project details
+cmake_minimum_required(VERSION 2.8.10)
+project("clblast" CXX)
+set(clblast_VERSION_MAJOR 0)
+set(clblast_VERSION_MINOR 1)
+set(clblast_VERSION_PATCH 0)
+
+# Options and their default values
+option(SAMPLES "Enable compilation of the examples" OFF)
+option(TUNERS "Enable compilation of the tuners" OFF)
+option(TESTS "Enable compilation of the performance and correctness tests" OFF)
+
+# ==================================================================================================
+
+# RPATH settings
+set(CMAKE_SKIP_BUILD_RPATH false) # Use, i.e. don't skip the full RPATH for the build tree
+set(CMAKE_BUILD_WITH_INSTALL_RPATH false) # When building, don't use the install RPATH already
+set(CMAKE_INSTALL_RPATH "") # The RPATH to be used when installing
+set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically determined parts
+
+# ==================================================================================================
+
+# Compiler-version check (requires at least CMake 2.8.10)
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
+ message(FATAL_ERROR "GCC version must be at least 4.7")
+ endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3)
+ message(FATAL_ERROR "Clang version must be at least 3.3")
+ endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
+ message(FATAL_ERROR "AppleClang version must be at least 5.0")
+ endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0)
+ message(FATAL_ERROR "ICC version must be at least 14.0")
+ endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
+ message(FATAL_ERROR "MS Visual Studio version must be at least 18.0")
+ endif()
+endif()
+
+# C++ compiler settings
+set(FLAGS "-O3 -std=c++11")
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+ set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
+ set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
+ endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+ set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
+ set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
+ set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
+endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+
+# ==================================================================================================
+
+# Package scripts location
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+
+# Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH.
+find_package(OpenCL REQUIRED)
+
+# Locates the CLTune library in case the tuners need to be compiled. "FindCLTune.cmake" is included.
+if(TUNERS)
+ find_package(CLTune)
+ if(NOT CLTUNE_FOUND)
+ message(STATUS "Could NOT find CLTune, disabling the compilation of the tuners")
+ set(TUNERS OFF)
+ endif()
+endif()
+
+# ==================================================================================================
+
+# Includes directories: CLBlast and OpenCL
+include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
+
+# ==================================================================================================
+
+# Sets the supported routines and the used kernels. New routines and kernels should be added here.
+set(KERNELS copy pad transpose padtranspose xaxpy xgemm)
+set(SAMPLE_PROGRAMS sgemm)
+set(ROUTINES_XY xaxpy)
+set(ROUTINES_ABC xgemm xsymm)
+set(ROUTINES ${ROUTINES_XY} ${ROUTINES_ABC})
+
+# ==================================================================================================
+
+# Gathers all source-files
+set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc)
+foreach(ROUTINE ${ROUTINES})
+ set(SOURCES ${SOURCES} src/routines/${ROUTINE}.cc)
+endforeach()
+
+# Creates and links the library
+add_library(clblast SHARED ${SOURCES})
+target_link_libraries(clblast ${OPENCL_LIBRARIES})
+
+# Installs the library
+install(TARGETS clblast DESTINATION lib)
+install(FILES include/clblast.h DESTINATION include)
+
+# ==================================================================================================
+
+# This section contains all the code related to the examples
+if(SAMPLES)
+
+ # Adds sample programs
+ foreach(SAMPLE ${SAMPLE_PROGRAMS})
+ add_executable(sample_${SAMPLE} samples/${SAMPLE}.cc)
+ target_link_libraries(sample_${SAMPLE} clblast ${OPENCL_LIBRARIES})
+ install(TARGETS sample_${SAMPLE} DESTINATION bin)
+ endforeach()
+
+endif()
+
+# ==================================================================================================
+
+# This section contains all the code related to the tuners. These tuners require the presence of
+# the CLTune library (not included as part of the source).
+if(TUNERS)
+
+ # Includes CLTune
+ include_directories(${CLTUNE_INCLUDE_DIRS})
+
+ # Creates the common tuner objects (requires CMake 2.8.8)
+ add_library(tuners_common OBJECT src/tuning/tuning.cc)
+
+ # Adds tuning executables
+ foreach(KERNEL ${KERNELS})
+ add_executable(tuner_${KERNEL} $<TARGET_OBJECTS:tuners_common> src/tuning/${KERNEL}.cc)
+ target_link_libraries(tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
+ install(TARGETS tuner_${KERNEL} DESTINATION bin)
+ endforeach()
+
+endif()
+
+# ==================================================================================================
+
+# Down from here is all test (performance and correctness) related. Note that these tests require
+# the compilation of the clBLAS library to act as a reference.
+if(TESTS)
+
+ # Enables clBLAS to serve as a reference for the tests (source-code is shipped with the project).
+ # This subproject uses specific flags to reduce the amount of warnings.
+ set(CMAKE_CXX_FLAGS_CLBLAST ${CMAKE_CXX_FLAGS})
+ set(CMAKE_CXX_FLAGS "-O3")
+ add_subdirectory(external/clBLAS/src)
+ set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS_CLBLAST})
+
+ # Adds new include directories for the reference clBLAS
+ include_directories(${clblast_SOURCE_DIR}/test ${clBLAS_SOURCE_DIR})
+
+ # Creates the common correctness-tests objects (requires CMake 2.8.8)
+ add_library(test_correctness_common OBJECT test/correctness/tester.cc)
+ add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
+ add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
+
+ # Compiles the correctness-tests
+ foreach(ROUTINE ${ROUTINES_XY})
+ add_executable(test_${ROUTINE}
+ $<TARGET_OBJECTS:test_correctness_common>
+ $<TARGET_OBJECTS:test_correctness_xy>
+ test/correctness/routines/${ROUTINE}.cc)
+ target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
+ install(TARGETS test_${ROUTINE} DESTINATION bin)
+ endforeach()
+ foreach(ROUTINE ${ROUTINES_ABC})
+ add_executable(test_${ROUTINE}
+ $<TARGET_OBJECTS:test_correctness_common>
+ $<TARGET_OBJECTS:test_correctness_abc>
+ test/correctness/routines/${ROUTINE}.cc)
+ target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
+ install(TARGETS test_${ROUTINE} DESTINATION bin)
+ endforeach()
+
+ # Creates the common performance-tests objects (requires CMake 2.8.8)
+ add_library(test_performance_common OBJECT test/performance/client.cc)
+
+ # Compiles the performance-tests
+ set(TEST_PERF_COMM )
+ foreach(ROUTINE ${ROUTINES})
+ add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
+ test/performance/routines/${ROUTINE}.cc)
+ target_link_libraries(client_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
+ install(TARGETS client_${ROUTINE} DESTINATION bin)
+ endforeach()
+
+endif()
+# ==================================================================================================
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..ae43189f
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,14 @@
+
+Copyright (c) 2015 Cedric Nugteren
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/README.md b/README.md
index 4d820626..1788e0d1 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Why CLBlast and not clBLAS or cuBLAS?
Use CLBlast instead of clBLAS:
-* When you care about performance (and you should).
+* When you care about achieving maximum performance.
* When you want to be able to inspect the BLAS kernels or easily customize them to your needs.
* When you run on exotic OpenCL devices which you need to tune yourself.
@@ -89,7 +89,7 @@ Note that CLBlast's tuners are based on the CLTune auto-tuning library, which ha
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance.
-The tuner will output a C++ database compatible line with the results, which can be added to `include/internal/database/xxxxx.h` in the appropriate section. Or, if tuning parameters already exist for your device but you believe they can be improved, this is also the place where they can be modified. If you want the found parameters to be included in future releases of CLBlast, please post the results in the corresponding issue on GitHub or [email the main author](www.cedricnugteren.nl).
+The tuner will output a C++ database compatible line with the results, which can be added to `include/internal/database/xxxxx.h` in the appropriate section. Or, if tuning parameters already exist for your device but you believe they can be improved, this is also the place where they can be modified. If you want the found parameters to be included in future releases of CLBlast, please post the results in the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
Compiling the tests (optional)
@@ -99,7 +99,7 @@ To make sure CLBlast is working correctly on your device (recommended), compile
cmake -DTESTS=ON ..
-Afterwards, executables in the form of `test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](github.com/clMathLibraries/clBLAS) for correctness. However, it is not required to install clBLAS separately on your system: it is included as part of the CLBlast source code in `external/clBLAS`.
+Afterwards, executables in the form of `test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. However, it is not required to install clBLAS separately on your system: it is included as part of the CLBlast source code in `external/clBLAS`.
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test.
@@ -109,7 +109,7 @@ Performance remarks
The CLBlast library provides pre-tuned parameter-values for a number of OpenCL devices. If your device is not among these, then out-of-the-box performance might be poor. Even if the device is included, performance might be poor in some cases: __the preview version is not thoroughly tested for performance yet__. See above under `Using the tuners` to find out how to tune for your device.
-The folder `doc/performance` contains some PDF files with performance results on tested devices. The graphs of the level-3 routines (e.g. Xgemm) show the strong points of CLBlast:
+The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm and Xsymm) show the strong points of CLBlast:
* The library reaches a high peak performance for large matrix sizes, in some cases a factor 2 more than clBLAS.
* The performance for non-power of 2 values (e.g. 1000) is roughly equal to power of 2 cases (e.g. 1024). This is not the case for clBLAS, which sometimes shows a drop of a factor 2.
@@ -193,13 +193,13 @@ Contributions are welcome in the form of tuning results for OpenCL devices previ
The contributing authors so far are:
-* [Cedric Nugteren](www.cedricnugteren.nl)
+* [Cedric Nugteren](http://www.cedricnugteren.nl)
Support us
-------------
-This project started in March 2015 as an evenings and weekends free-time project next to a full-time job. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](www.cedricnugteren.nl).
+This project started in March 2015 as an evenings and weekends free-time project next to a full-time job. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
To-do list before release of version 1.0
diff --git a/cmake/Modules/FindCLTune.cmake b/cmake/Modules/FindCLTune.cmake
new file mode 100644
index 00000000..3a37576a
--- /dev/null
+++ b/cmake/Modules/FindCLTune.cmake
@@ -0,0 +1,68 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+# width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+#
+# ==================================================================================================
+#
+# Defines the following variables:
+# CLTUNE_FOUND Boolean holding whether or not the CLTune library was found
+# CLTUNE_INCLUDE_DIRS The CLTune include directory
+# CLTUNE_LIBRARIES The CLTune library
+#
+# In case CLTune is not installed in the default directory, set the CLTUNE_ROOT variable to point to
+# the root of CLTune, such that 'cltune.h' can be found in $CLTUNE_ROOT/include. This can either be
+# done using an environmental variable (e.g. export CLTUNE_ROOT=/path/to/cltune) or using a CMake
+# variable (e.g. cmake -DCLTUNE_ROOT=/path/to/cltune ..).
+#
+# ==================================================================================================
+
+# Sets the possible install locations
+set(CLTUNE_HINTS
+ ${CLTUNE_ROOT}
+ $ENV{CLTUNE_ROOT}
+)
+set(CLTUNE_PATHS
+ /usr
+ /usr/local
+)
+
+# Finds the include directories
+find_path(CLTUNE_INCLUDE_DIRS
+ NAMES cltune.h
+ HINTS ${CLTUNE_HINTS}
+ PATH_SUFFIXES include inc include/x86_64 include/x64
+ PATHS ${CLTUNE_PATHS}
+ DOC "CLTune include header cltune.h"
+)
+mark_as_advanced(CLTUNE_INCLUDE_DIRS)
+
+# Finds the library
+find_library(CLTUNE_LIBRARIES
+ NAMES cltune
+ HINTS ${CLTUNE_HINTS}
+ PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32
+ PATHS ${CLTUNE_PATHS}
+ DOC "CLTune library"
+)
+mark_as_advanced(CLTUNE_LIBRARIES)
+
+# ==================================================================================================
+
+# Notification messages
+if(NOT CLTUNE_INCLUDE_DIRS)
+ message(STATUS "Could NOT find 'cltune.h', install CLTune or set CLTUNE_ROOT")
+endif()
+if(NOT CLTUNE_LIBRARIES)
+ message(STATUS "Could NOT find CLTune library, install it or set CLTUNE_ROOT")
+endif()
+
+# Determines whether or not CLTune was found
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CLTune DEFAULT_MSG CLTUNE_INCLUDE_DIRS CLTUNE_LIBRARIES)
+
+# ==================================================================================================
diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake
new file mode 100644
index 00000000..2a4c583c
--- /dev/null
+++ b/cmake/Modules/FindOpenCL.cmake
@@ -0,0 +1,75 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+# width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+#
+# ==================================================================================================
+#
+# Defines the following variables:
+# OPENCL_FOUND Boolean holding whether or not the OpenCL library was found
+# OPENCL_INCLUDE_DIRS The OpenCL include directory
+# OPENCL_LIBRARIES The OpenCL library
+#
+# In case OpenCL is not installed in the default directory, set the OPENCL_ROOT variable to point to
+# the root of OpenCL, such that 'OpenCL/cl.h' or 'CL/cl.h' can be found in $OPENCL_ROOT/include.
+# This can either be done using an environmental variable (e.g. export OPENCL_ROOT=/path/to/opencl)
+# or using a CMake variable (e.g. cmake -DOPENCL_ROOT=/path/to/opencl ..).
+#
+# ==================================================================================================
+
+# Sets the possible install locations
+set(OPENCL_HINTS
+ ${OPENCL_ROOT}
+ $ENV{OPENCL_ROOT}
+ $ENV{AMDAPPSDKROOT}
+ $ENV{CUDA_PATH}
+ $ENV{INTELOCLSDKROOT}
+ $ENV{NVSDKCOMPUTE_ROOT}
+ $ENV{ATISTREAMSDKROOT}
+)
+set(OPENCL_PATHS
+ /usr/local/cuda
+ /opt/cuda
+ /usr
+ /usr/local
+)
+
+# Finds the include directories
+find_path(OPENCL_INCLUDE_DIRS
+ NAMES OpenCL/cl.h CL/cl.h
+ HINTS ${OPENCL_HINTS}
+ PATH_SUFFIXES include OpenCL/common/inc inc include/x86_64 include/x64
+ PATHS ${OPENCL_PATHS}
+ DOC "OpenCL include header OpenCL/cl.h or CL/cl.h"
+)
+mark_as_advanced(OPENCL_INCLUDE_DIRS)
+
+# Finds the library
+find_library(OPENCL_LIBRARIES
+ NAMES OpenCL
+ HINTS ${OPENCL_HINTS}
+ PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
+ PATHS ${OPENCL_PATHS}
+ DOC "OpenCL library"
+)
+mark_as_advanced(OPENCL_LIBRARIES)
+
+# ==================================================================================================
+
+# Notification messages
+if(NOT OPENCL_INCLUDE_DIRS)
+ message(STATUS "Could NOT find 'OpenCL/cl.h' or 'CL/cl.h', install OpenCL or set OPENCL_ROOT")
+endif()
+if(NOT OPENCL_LIBRARIES)
+ message(STATUS "Could NOT find OpenCL library, install it or set OPENCL_ROOT")
+endif()
+
+# Determines whether or not OpenCL was found
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(OpenCL DEFAULT_MSG OPENCL_INCLUDE_DIRS OPENCL_LIBRARIES)
+
+# ==================================================================================================
diff --git a/doc/performance/GeForce_GTX480/SAXPY.pdf b/doc/performance/GeForce_GTX480/SAXPY.pdf
new file mode 100644
index 00000000..29bf0056
--- /dev/null
+++ b/doc/performance/GeForce_GTX480/SAXPY.pdf
Binary files differ
diff --git a/doc/performance/GeForce_GTX480/SGEMM.pdf b/doc/performance/GeForce_GTX480/SGEMM.pdf
new file mode 100644
index 00000000..ac6e59c8
--- /dev/null
+++ b/doc/performance/GeForce_GTX480/SGEMM.pdf
Binary files differ
diff --git a/doc/performance/GeForce_GTX480/SSYMM.pdf b/doc/performance/GeForce_GTX480/SSYMM.pdf
new file mode 100644
index 00000000..ca532190
--- /dev/null
+++ b/doc/performance/GeForce_GTX480/SSYMM.pdf
Binary files differ
diff --git a/doc/performance/Iris/SAXPY.pdf b/doc/performance/Iris/SAXPY.pdf
new file mode 100644
index 00000000..2d9e99e8
--- /dev/null
+++ b/doc/performance/Iris/SAXPY.pdf
Binary files differ
diff --git a/doc/performance/Iris/SGEMM.pdf b/doc/performance/Iris/SGEMM.pdf
new file mode 100644
index 00000000..fe671e71
--- /dev/null
+++ b/doc/performance/Iris/SGEMM.pdf
Binary files differ
diff --git a/doc/performance/Iris/SSYMM.pdf b/doc/performance/Iris/SSYMM.pdf
new file mode 100644
index 00000000..61eb9848
--- /dev/null
+++ b/doc/performance/Iris/SSYMM.pdf
Binary files differ
diff --git a/doc/performance/Tesla_K40m/SAXPY.pdf b/doc/performance/Tesla_K40m/SAXPY.pdf
new file mode 100644
index 00000000..778eb94d
--- /dev/null
+++ b/doc/performance/Tesla_K40m/SAXPY.pdf
Binary files differ
diff --git a/doc/performance/Tesla_K40m/SGEMM.pdf b/doc/performance/Tesla_K40m/SGEMM.pdf
new file mode 100644
index 00000000..0b5891d5
--- /dev/null
+++ b/doc/performance/Tesla_K40m/SGEMM.pdf
Binary files differ
diff --git a/doc/performance/Tesla_K40m/SSYMM.pdf b/doc/performance/Tesla_K40m/SSYMM.pdf
new file mode 100644
index 00000000..f62bcc98
--- /dev/null
+++ b/doc/performance/Tesla_K40m/SSYMM.pdf
Binary files differ
diff --git a/include/clblast.h b/include/clblast.h
new file mode 100644
index 00000000..4c3c5201
--- /dev/null
+++ b/include/clblast.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the interface to the CLBlast BLAS routines. It also contains the definitions
+// of the returned status codes and the layout and transpose types. This is the only header users
+// of CLBlast should include and use.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLBLAST_H_
+#define CLBLAST_CLBLAST_H_
+
+#include <cstdlib> // For size_t
+
+// Includes the normal OpenCL C header
+#if defined(__APPLE__) || defined(__MACOSX)
+ #include <OpenCL/opencl.h>
+#else
+ #include <CL/opencl.h>
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// Status codes. These codes can be returned by functions declared in this header file. The error
+// codes match either the standard OpenCL error codes or the clBLAS error codes.
+enum class StatusCode {
+
+ // Status codes in common with the OpenCL standard
+ kSuccess = 0, // CL_SUCCESS
+ kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
+ kBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
+ kInvalidBinary = -42, // CL_INVALID_BINARY
+ kInvalidKernel = -48, // CL_INVALID_KERNEL
+ kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
+ kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
+ kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
+ kInvalidTempBufferSize = -61, // CL_INVALID_BUFFER_SIZE
+
+ // Status codes in common with the clBLAS library
+ kNotImplemented = -1024, // Routine or functionality not implemented yet
+ kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer
+ kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer
+ kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer
+ kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer
+ kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer
+ kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero
+ kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension
+ kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension
+ kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension
+ kInvalidIncrementX = -1013, // Increment of vector X cannot be zero
+ kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero
+ kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small
+ kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small
+ kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small
+ kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small
+ kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
+
+ // Custom additional status codes for CLBlast
+ kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel
+ kKernelRunError = -2047, // Problem occurred while running the kernel
+ kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
+ kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
+ kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
+};
+
+// Matrix layout and transpose types
+enum class Layout { kRowMajor, kColMajor };
+enum class Transpose { kNo, kYes, kConjugate };
+enum class Side { kLeft, kRight };
+enum class Triangle { kUpper, kLower };
+
+// Precision scoped enum (values in bits)
+enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
+ kComplexSingle = 3232, kComplexDouble = 6464 };
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+
+// Templated-precision vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
+template <typename T>
+StatusCode Axpy(const size_t m, const T alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event);
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+
+// Templated-precision generalized matrix multiplication: SGEMM/DGEMM
+template <typename T>
+StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+
+// Templated-precision symmetric matrix multiplication: SSYMM/DSYMM
+template <typename T>
+StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_CLBLAST_H_
+#endif
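
A minimal usage sketch of the Gemm interface declared above; the float instantiation (SGEMM), an already-created cl_command_queue, and pre-filled row-major device buffers are assumptions here:

    #include <clblast.h>

    // Sketch only: the queue and the m-by-k, k-by-n and m-by-n row-major buffers are assumed
    // to be created and filled elsewhere.
    clblast::StatusCode RunSgemm(cl_command_queue queue, cl_mem a, cl_mem b, cl_mem c,
                                 size_t m, size_t n, size_t k) {
      cl_event event = nullptr;
      auto status = clblast::Gemm<float>(clblast::Layout::kRowMajor,
                                         clblast::Transpose::kNo, clblast::Transpose::kNo,
                                         m, n, k,
                                         1.0f,
                                         a, 0, k,    // A: m-by-k, leading dimension k
                                         b, 0, n,    // B: k-by-n, leading dimension n
                                         0.0f,
                                         c, 0, n,    // C: m-by-n, leading dimension n
                                         &queue, &event);
      if (status == clblast::StatusCode::kSuccess) { clWaitForEvents(1, &event); }
      return status;
    }
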
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
new file mode 100644
index 00000000..73040fdb
--- /dev/null
+++ b/include/internal/clpp11.h
@@ -0,0 +1,524 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a C++11 wrapper around some OpenCL C data-types, similar to Khronos' cl.hpp.
+// The main differences are modern C++11 support and a straightforward implementation of the basic
+// needs (as required for this project). It also includes some extra functionality not available
+// in cl.hpp, such as including the sources with a Program object and querying a Kernel's validity
+// in terms of local memory usage.
+//
+// This file is adapted from the C++ bindings from the CLTune project and therefore contains the
+// following copyright notice:
+//
+// =================================================================================================
+//
+// Copyright 2014 SURFsara
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLPP11_H_
+#define CLBLAST_CLPP11_H_
+
+#include <utility> // std::swap
+#include <algorithm> // std::copy
+#include <string> // std::string
+#include <vector> // std::vector
+#include <stdexcept> // std::runtime_error
+
+// Includes the normal OpenCL C header
+#if defined(__APPLE__) || defined(__MACOSX)
+ #include <OpenCL/opencl.h>
+#else
+ #include <CL/opencl.h>
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// Base class for any object
+class Object {
+ protected:
+
+ // Error handling (NOTE: these functions are [[noreturn]])
+ void Error(const std::string &message) const {
+ throw std::runtime_error("Internal OpenCL error: "+message);
+ }
+ void Error(const cl_int status) const {
+ throw std::runtime_error("Internal OpenCL error with status: "+std::to_string(status));
+ }
+};
+
+// =================================================================================================
+
+// Base class for objects which require memory management
+class ObjectWithState: public Object {
+
+};
+
+// =================================================================================================
+
+// C++11 version of cl_event
+class Event: public Object {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Event(const cl_event event): event_(event) { }
+
+ // New event
+ Event(): event_() {}
+
+ // Public functions
+ size_t GetProfilingStart() const {
+ auto bytes = size_t{0};
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
+ auto result = size_t{0};
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &result, nullptr);
+ return result;
+ }
+ size_t GetProfilingEnd() const {
+ auto bytes = size_t{0};
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
+ auto result = size_t{0};
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &result, nullptr);
+ return result;
+ }
+ cl_int Wait() const {
+ return clWaitForEvents(1, &event_);
+ }
+
+ // Accessors to the private data-member
+ cl_event operator()() const { return event_; }
+ cl_event& operator()() { return event_; }
+ private:
+ cl_event event_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_platform_id
+class Platform: public Object {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Platform(const cl_platform_id platform): platform_(platform) { }
+
+ // Initialize the platform. Note that this constructor can throw exceptions!
+ explicit Platform(const size_t platform_id) {
+ auto num_platforms = cl_uint{0};
+ auto status = clGetPlatformIDs(0, nullptr, &num_platforms);
+ if (status != CL_SUCCESS) { Error(status); }
+ if (num_platforms == 0) { Error("no platforms found"); }
+ auto platforms = std::vector<cl_platform_id>(num_platforms);
+ status = clGetPlatformIDs(num_platforms, platforms.data(), nullptr);
+ if (status != CL_SUCCESS) { Error(status); }
+ if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
+ platform_ = platforms[platform_id];
+ }
+
+ // Accessors to the private data-member
+ cl_platform_id operator()() const { return platform_; }
+ cl_platform_id& operator()() { return platform_; }
+ private:
+ cl_platform_id platform_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_device_id
+class Device: public Object {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Device(const cl_device_id device): device_(device) { }
+
+ // Initialize the device. Note that this constructor can throw exceptions!
+ explicit Device(const Platform &platform, const cl_device_type type, const size_t device_id) {
+ auto num_devices = cl_uint{0};
+ auto status = clGetDeviceIDs(platform(), type, 0, nullptr, &num_devices);
+ if (status != CL_SUCCESS) { Error(status); }
+ if (num_devices == 0) { Error("no devices found"); }
+ auto devices = std::vector<cl_device_id>(num_devices);
+ status = clGetDeviceIDs(platform(), type, num_devices, devices.data(), nullptr);
+ if (status != CL_SUCCESS) { Error(status); }
+ if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
+ device_ = devices[device_id];
+ }
+
+ // Public functions
+ std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
+ cl_device_type Type() const { return GetInfo<cl_device_type>(CL_DEVICE_TYPE); }
+ std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
+ std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
+ std::string Extensions() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
+ size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
+ cl_ulong LocalMemSize() const { return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE); }
+ cl_uint MaxWorkItemDimensions() const {
+ return GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
+ }
+ std::vector<size_t> MaxWorkItemSizes() const {
+ return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
+ }
+
+ // Configuration-validity checks
+ bool IsLocalMemoryValid(const size_t local_mem_usage) const {
+ return (local_mem_usage <= LocalMemSize());
+ }
+ bool IsThreadConfigValid(const std::vector<size_t> &local) const {
+ auto local_size = size_t{1};
+ for (auto &item: local) { local_size *= item; }
+ for (auto i=size_t{0}; i<local.size(); ++i) {
+ if (local[i] > MaxWorkItemSizes()[i]) { return false; }
+ }
+ if (local_size > MaxWorkGroupSize()) { return false; }
+ if (local.size() > MaxWorkItemDimensions()) { return false; }
+ return true;
+ }
+
+ // Accessors to the private data-member
+ cl_device_id operator()() const { return device_; }
+ cl_device_id& operator()() { return device_; }
+ private:
+
+ // Helper functions
+ template <typename T>
+ T GetInfo(const cl_device_info info) const {
+ auto bytes = size_t{0};
+ clGetDeviceInfo(device_, info, 0, nullptr, &bytes);
+ auto result = T(0);
+ clGetDeviceInfo(device_, info, bytes, &result, nullptr);
+ return result;
+ }
+ template <typename T>
+ std::vector<T> GetInfoVector(const cl_device_info info) const {
+ auto bytes = size_t{0};
+ clGetDeviceInfo(device_, info, 0, nullptr, &bytes);
+ auto result = std::vector<T>(bytes/sizeof(T));
+ clGetDeviceInfo(device_, info, bytes, result.data(), nullptr);
+ return result;
+ }
+ std::string GetInfoString(const cl_device_info info) const {
+ auto bytes = size_t{0};
+ clGetDeviceInfo(device_, info, 0, nullptr, &bytes);
+ auto result = std::vector<char>(bytes);
+ clGetDeviceInfo(device_, info, bytes, result.data(), nullptr);
+ return std::string(result.data());
+ }
+
+ cl_device_id device_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_context
+class Context: public ObjectWithState {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Context(const cl_context context): context_(context) {
+ clRetainContext(context_);
+ }
+
+ // Memory management
+ explicit Context(const Device &device) {
+ auto status = CL_SUCCESS;
+ const cl_device_id dev = device();
+ context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
+ if (status != CL_SUCCESS) { Error(status); }
+ }
+ ~Context() {
+ clReleaseContext(context_);
+ }
+ Context(const Context &other):
+ context_(other.context_) {
+ clRetainContext(context_);
+ }
+ Context& operator=(Context other) {
+ swap(*this, other);
+ return *this;
+ }
+ friend void swap(Context &first, Context &second) {
+ std::swap(first.context_, second.context_);
+ }
+
+ // Accessors to the private data-member
+ cl_context operator()() const { return context_; }
+ cl_context& operator()() { return context_; }
+ private:
+ cl_context context_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_program. Additionally holds the program's source code.
+class Program: public ObjectWithState {
+ public:
+
+ // Note that there is no constructor based on the plain C data-type because of extra state
+
+ // Memory management
+ explicit Program(const Context &context, const std::string &source):
+ length_(source.length()) {
+ std::copy(source.begin(), source.end(), back_inserter(source_));
+ source_ptr_ = source_.data();
+ auto status = CL_SUCCESS;
+ program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
+ if (status != CL_SUCCESS) { Error(status); }
+ }
+ ~Program() {
+ clReleaseProgram(program_);
+ }
+ Program(const Program &other):
+ length_(other.length_),
+ source_(other.source_),
+ source_ptr_(other.source_ptr_),
+ program_(other.program_) {
+ clRetainProgram(program_);
+ }
+ Program& operator=(Program other) {
+ swap(*this, other);
+ return *this;
+ }
+ /*
+ TODO: Implement move construction/assignment?
+ Program(Program &&other) {
+ clRetainProgram(program_);
+ swap(*this, other);
+ }
+ Program& operator=(Program &&other) {
+ swap(*this, other);
+ return *this;
+ }*/
+ friend void swap(Program &first, Program &second) {
+ std::swap(first.length_, second.length_);
+ std::swap(first.source_, second.source_);
+ std::swap(first.source_ptr_, second.source_ptr_);
+ std::swap(first.program_, second.program_);
+ }
+
+ // Public functions
+ cl_int Build(const Device &device, const std::string &options) {
+ const cl_device_id dev = device();
+ return clBuildProgram(program_, 1, &dev, options.c_str(), nullptr, nullptr);
+ }
+ std::string GetBuildInfo(const Device &device) const {
+ auto bytes = size_t{0};
+ clGetProgramBuildInfo(program_, device(), CL_PROGRAM_BUILD_LOG, 0, nullptr, &bytes);
+ auto result = std::vector<char>(bytes);
+ clGetProgramBuildInfo(program_, device(), CL_PROGRAM_BUILD_LOG, bytes, result.data(), nullptr);
+ return std::string(result.data());
+ }
+
+ // Accessors to the private data-member
+ cl_program operator()() const { return program_; }
+ cl_program& operator()() { return program_; }
+ private:
+ size_t length_;
+ std::vector<char> source_;
+ const char* source_ptr_;
+ cl_program program_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_kernel
+class Kernel: public ObjectWithState {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Kernel(const cl_kernel kernel): kernel_(kernel) {
+ clRetainKernel(kernel_);
+ }
+
+ // Memory management
+ explicit Kernel(const Program &program, const std::string &name) {
+ auto status = CL_SUCCESS;
+ kernel_ = clCreateKernel(program(), name.c_str(), &status);
+ if (status != CL_SUCCESS) { Error(status); }
+ }
+ ~Kernel() {
+ clReleaseKernel(kernel_);
+ }
+ Kernel(const Kernel &other):
+ kernel_(other.kernel_) {
+ clRetainKernel(kernel_);
+ }
+ Kernel& operator=(Kernel other) {
+ swap(*this, other);
+ return *this;
+ }
+ friend void swap(Kernel &first, Kernel &second) {
+ std::swap(first.kernel_, second.kernel_);
+ }
+
+ // Public functions
+ template <typename T> // Note: doesn't work with T=Buffer
+ cl_int SetArgument(const cl_uint index, const T &value) {
+ return clSetKernelArg(kernel_, index, sizeof(T), &value);
+ }
+ size_t LocalMemUsage(const Device &device) const {
+ auto bytes = size_t{0};
+ clGetKernelWorkGroupInfo(kernel_, device(), CL_KERNEL_LOCAL_MEM_SIZE, 0, nullptr, &bytes);
+ auto result = size_t{0};
+ clGetKernelWorkGroupInfo(kernel_, device(), CL_KERNEL_LOCAL_MEM_SIZE, bytes, &result, nullptr);
+ return result;
+ }
+
+ // Accessors to the private data-member
+ cl_kernel operator()() const { return kernel_; }
+ cl_kernel& operator()() { return kernel_; }
+ private:
+ cl_kernel kernel_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_command_queue
+class CommandQueue: public ObjectWithState {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit CommandQueue(const cl_command_queue queue): queue_(queue) {
+ clRetainCommandQueue(queue_);
+ }
+
+ // Memory management
+ explicit CommandQueue(const Context &context, const Device &device) {
+ auto status = CL_SUCCESS;
+ queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+ if (status != CL_SUCCESS) { Error(status); }
+ }
+ ~CommandQueue() {
+ clReleaseCommandQueue(queue_);
+ }
+ CommandQueue(const CommandQueue &other):
+ queue_(other.queue_) {
+ clRetainCommandQueue(queue_);
+ }
+ CommandQueue& operator=(CommandQueue other) {
+ swap(*this, other);
+ return *this;
+ }
+ friend void swap(CommandQueue &first, CommandQueue &second) {
+ std::swap(first.queue_, second.queue_);
+ }
+
+ // Public functions
+ cl_int EnqueueKernel(const Kernel &kernel, const std::vector<size_t> &global,
+ const std::vector<size_t> &local, Event &event) {
+ return clEnqueueNDRangeKernel(queue_, kernel(), static_cast<cl_uint>(global.size()), nullptr,
+ global.data(), local.data(), 0, nullptr, &(event()));
+ }
+ Context GetContext() const {
+ auto bytes = size_t{0};
+ clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, 0, nullptr, &bytes);
+ cl_context result;
+ clGetCommandQueueInfo(queue_, CL_QUEUE_CONTEXT, bytes, &result, nullptr);
+ return Context(result);
+ }
+ Device GetDevice() const {
+ auto bytes = size_t{0};
+ clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, 0, nullptr, &bytes);
+ cl_device_id result;
+ clGetCommandQueueInfo(queue_, CL_QUEUE_DEVICE, bytes, &result, nullptr);
+ return Device(result);
+ }
+ cl_int Finish() {
+ return clFinish(queue_);
+ }
+
+ // Accessors to the private data-member
+ cl_command_queue operator()() const { return queue_; }
+ cl_command_queue& operator()() { return queue_; }
+ private:
+ cl_command_queue queue_;
+};
+
+// =================================================================================================
+
+// C++11 version of cl_mem
+class Buffer: public ObjectWithState {
+ public:
+
+ // Constructor based on the plain C data-type
+ explicit Buffer(const cl_mem buffer): buffer_(buffer) {
+ clRetainMemObject(buffer_);
+ }
+
+ // Memory management
+ explicit Buffer(const Context &context, const cl_mem_flags flags, const size_t bytes) {
+ auto status = CL_SUCCESS;
+ buffer_ = clCreateBuffer(context(), flags, bytes, nullptr, &status);
+ if (status != CL_SUCCESS) { Error(status); }
+ }
+ ~Buffer() {
+ clReleaseMemObject(buffer_);
+ }
+ Buffer(const Buffer &other):
+ buffer_(other.buffer_) {
+ clRetainMemObject(buffer_);
+ }
+ Buffer& operator=(Buffer other) {
+ swap(*this, other);
+ return *this;
+ }
+ friend void swap(Buffer &first, Buffer &second) {
+ std::swap(first.buffer_, second.buffer_);
+ }
+
+ // Public functions
+ template <typename T>
+ cl_int ReadBuffer(const CommandQueue &queue, const size_t bytes, T* host) {
+ return clEnqueueReadBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr);
+ }
+ template <typename T>
+ cl_int ReadBuffer(const CommandQueue &queue, const size_t bytes, std::vector<T> &host) {
+ return ReadBuffer(queue, bytes, host.data());
+ }
+ template <typename T>
+ cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const T* host) {
+ return clEnqueueWriteBuffer(queue(), buffer_, CL_TRUE, 0, bytes, host, 0, nullptr, nullptr);
+ }
+ template <typename T>
+ cl_int WriteBuffer(const CommandQueue &queue, const size_t bytes, const std::vector<T> &host) {
+ return WriteBuffer(queue, bytes, &host[0]);
+ }
+ size_t GetSize() const {
+ auto bytes = size_t{0};
+ auto status = clGetMemObjectInfo(buffer_, CL_MEM_SIZE, 0, nullptr, &bytes);
+ if (status != CL_SUCCESS) { Error(status); }
+ auto result = size_t{0};
+ status = clGetMemObjectInfo(buffer_, CL_MEM_SIZE, bytes, &result, nullptr);
+ if (status != CL_SUCCESS) { Error(status); }
+ return result;
+ }
+
+ // Accessors to the private data-member
+ cl_mem operator()() const { return buffer_; }
+ cl_mem& operator()() { return buffer_; }
+ private:
+ cl_mem buffer_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_CLPP11_H_
+#endif
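
A minimal sketch of how these wrappers compose; platform 0, device 0, and the trivial kernel string are assumptions for illustration:

    #include <string>
    #include <vector>
    #include "internal/clpp11.h"

    void WrapperSketch() {
      auto platform = clblast::Platform(size_t{0});
      auto device = clblast::Device(platform, CL_DEVICE_TYPE_ALL, size_t{0});
      auto context = clblast::Context(device);
      auto queue = clblast::CommandQueue(context, device);

      // Compiles a small kernel from source (the constructors throw via Error() on OpenCL failures)
      auto source = std::string{"__kernel void fill(__global float* x) { x[get_global_id(0)] = 1.0f; }"};
      auto program = clblast::Program(context, source);
      if (program.Build(device, "") != CL_SUCCESS) { return; }
      auto kernel = clblast::Kernel(program, "fill");

      // Creates a device buffer, runs the kernel, and reads the result back
      auto host = std::vector<float>(256, 0.0f);
      auto bytes = host.size()*sizeof(float);
      auto buffer = clblast::Buffer(context, CL_MEM_READ_WRITE, bytes);
      kernel.SetArgument(0, buffer());          // Raw cl_mem, since SetArgument doesn't take a Buffer
      auto event = clblast::Event();
      queue.EnqueueKernel(kernel, {host.size()}, {64}, event);
      event.Wait();
      buffer.ReadBuffer(queue, bytes, host);
    }
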
diff --git a/include/internal/database.h b/include/internal/database.h
new file mode 100644
index 00000000..dbbdd5c0
--- /dev/null
+++ b/include/internal/database.h
@@ -0,0 +1,90 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Database class, providing a static variable holding the actual database
+// information. The class also provides utility functions to search the database and to access a
+// found entry by parameter-key. The database itself is filled in the corresponding source-file and
+// partially also by the database/xxxxx.h files, in which kernel-specific parameters are found.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_DATABASE_H_
+#define CLBLAST_DATABASE_H_
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include "internal/utilities.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+class Database {
+ public:
+
+ // Type alias for the database parameters
+ using Parameters = std::unordered_map<std::string,size_t>;
+
+ // Structures for content inside the database
+ struct DatabaseDevice {
+ const std::string name;
+ const Parameters parameters;
+ };
+ struct DatabaseVendor {
+ const cl_device_type type;
+ const std::string name;
+ const std::vector<DatabaseDevice> devices;
+ };
+ struct DatabaseEntry {
+ const std::string kernel;
+ const Precision precision;
+ const std::vector<DatabaseVendor> vendors;
+ };
+
+ // The default vendor or device
+ static constexpr auto kDefault = "Default";
+
+ // The database consists of separate database entries, stored together in a vector
+ static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
+ static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
+ static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
+ static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
+ static const DatabaseEntry TraSingle, TraDouble, TraComplexSingle, TraComplexDouble;
+ static const DatabaseEntry PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble;
+ static const std::vector<DatabaseEntry> database;
+
+ // The constructor
+ explicit Database(const CommandQueue &queue, const std::vector<std::string> &routines,
+ const Precision precision);
+
+ // Accessor of values by key
+ size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
+
+ // Obtain a list of OpenCL pre-processor defines based on the parameters
+ std::string GetDefines() const;
+
+ private:
+ Parameters Search(const std::string &this_kernel, const cl_device_type this_type,
+ const std::string &this_vendor, const std::string &this_device,
+ const Precision this_precision) const;
+
+ // Tests equality between a database-vendor string and an OpenCL vendor string
+ bool VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const;
+
+ // Found parameters suitable for this device/kernel
+ Parameters parameters_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_DATABASE_H_
+#endif
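
A minimal sketch of the intended use, assuming an existing clblast::CommandQueue (see clpp11.h above); the routine name "Copy" matches the entries in database/copy.h:

    #include <cstdlib>
    #include "internal/database.h"

    size_t CopyTileX(const clblast::CommandQueue &queue) {
      // Selects the best-found parameters for this device (or the defaults)
      auto db = clblast::Database(queue, {"Copy"}, clblast::Precision::kSingle);
      return db["COPY_DIMX"];    // Single-parameter look-up by key; db.GetDefines() would instead
                                 // give the full set as OpenCL pre-processor defines
    }
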
diff --git a/include/internal/database/copy.h b/include/internal/database/copy.h
new file mode 100644
index 00000000..b9335fc9
--- /dev/null
+++ b/include/internal/database/copy.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Copy kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopySingle = {
+ "Copy", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+ { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_WPT",2}, {"COPY_VW",4} } },
+ { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",4}, {"COPY_VW",4} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",4} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopyDouble = {
+ "Copy", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ { "Tesla K20m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+ { "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopyComplexSingle = {
+ "Copy", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ { "Tesla K20m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",1} } },
+ { "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::CopyComplexDouble = {
+ "Copy", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h
new file mode 100644
index 00000000..5af75308
--- /dev/null
+++ b/include/internal/database/pad.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Pad kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
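+
+// The parameter names below presumably map onto the Pad kernel's tuning knobs: PAD_DIMX and
+// PAD_DIMY are the work-group dimensions, and PAD_WPTX and PAD_WPTY the amount of work per
+// thread in either dimension.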
+
+const Database::DatabaseEntry Database::PadSingle = {
+ "Pad", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+ { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadDouble = {
+ "Pad", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadComplexSingle = {
+ "Pad", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+ { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadComplexDouble = {
+ "Pad", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h
new file mode 100644
index 00000000..f1127d60
--- /dev/null
+++ b/include/internal/database/padtranspose.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the PadTranspose kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
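+
+// The parameter names below presumably map onto the PadTranspose kernel's tuning knobs:
+// PADTRA_TILE is the square tile size, PADTRA_WPT the amount of work per thread, and PADTRA_PAD
+// the amount of local-memory padding used to avoid bank conflicts.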
+
+const Database::DatabaseEntry Database::PadTraSingle = {
+ "PadTranspose", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+ { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+ { "Tesla K40m", { {"PADTRA_TILE",32}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraDouble = {
+ "PadTranspose", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraComplexSingle = {
+ "PadTranspose", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::PadTraComplexDouble = {
+ "PadTranspose", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ { "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h
new file mode 100644
index 00000000..0814eb8a
--- /dev/null
+++ b/include/internal/database/transpose.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Transpose kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
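+
+// The parameter names below presumably map onto the Transpose kernel's tuning knobs: TRA_DIM is
+// the work-group/tile dimension, TRA_WPT the amount of work per thread, and TRA_PAD the amount
+// of local-memory padding used to avoid bank conflicts.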
+
+const Database::DatabaseEntry Database::TraSingle = {
+ "Transpose", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraDouble = {
+ "Transpose", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraComplexSingle = {
+ "Transpose", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::TraComplexDouble = {
+ "Transpose", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h
new file mode 100644
index 00000000..c331945a
--- /dev/null
+++ b/include/internal/database/xaxpy.h
@@ -0,0 +1,129 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Xaxpy kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
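+
+// The parameter names below presumably map onto the Xaxpy kernel's tuning knobs: WGS is the
+// work-group size, WPT the amount of work per thread, and VW the vector width of the loads and
+// stores.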
+
+const Database::DatabaseEntry Database::XaxpySingle = {
+ "Xaxpy", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",2} } },
+ { "Tesla K20m", { {"WGS",128}, {"WPT",2}, {"VW",2} } },
+ { "Tesla K40m", { {"WGS",128}, {"WPT",1}, {"VW",4} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"WGS",512}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyDouble = {
+ "Xaxpy", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ { "Tesla K20m", { {"WGS",512}, {"WPT",1}, {"VW",2} } },
+ { "Tesla K40m", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ }
+};
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyComplexSingle = {
+ "Xaxpy", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+ { "Tesla K20m", { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ { "Tesla K40m", { {"WGS",128}, {"WPT",2}, {"VW",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XaxpyComplexDouble = {
+ "Xaxpy", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"WGS",128}, {"WPT",2}, {"VW",1} } },
+ { "Tesla K20m", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
+ { "Tesla K40m", { {"WGS",64}, {"WPT",2}, {"VW",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h
new file mode 100644
index 00000000..edf41e12
--- /dev/null
+++ b/include/internal/database/xgemm.h
@@ -0,0 +1,133 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the Xgemm kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
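+
+// The parameter names below presumably correspond to the tuning knobs of the Xgemm OpenCL kernel
+// (src/kernels/xgemm.opencl in this commit): MWG/NWG/KWG are the per-workgroup tile sizes,
+// MDIMC/NDIMC the number of threads per work-group, MDIMA/NDIMB the re-shaped tile dimensions
+// for loading A and B, KWI the unroll factor of the KWG-loop, VWM/VWN the vector widths,
+// STRM/STRN whether strided thread access is used, and SA/SB whether A and B are cached in
+// local memory.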
+
+const Database::DatabaseEntry Database::XgemmSingle = {
+ "Xgemm", Precision::kSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"MWG",128}, {"NWG",64}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ { "Tesla K20m", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",4}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ { "Tesla K40m", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ { kDefault, { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",8}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmDouble = {
+ "Xgemm", Precision::kDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ { "Tesla K20m", { {"MWG",64}, {"NWG",128}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",32}, {"KWI",8}, {"VWM",2}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+ { "Tesla K40m", { {"MWG",64}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
+ { kDefault, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmComplexSingle = {
+ "Xgemm", Precision::kComplexSingle, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+ { "Tesla K20m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",8}, {"KWI",8}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+ { "Tesla K40m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",0}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+ { kDefault, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ { "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemmComplexDouble = {
+ "Xgemm", Precision::kComplexDouble, {
+ { // NVIDIA GPUs
+ CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
+ { "GeForce GTX 480", { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
+ { "Tesla K20m", { {"MWG",16}, {"NWG",128}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",8}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
+ { "Tesla K40m", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",1} } },
+ { kDefault, { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ { // AMD GPUs
+ CL_DEVICE_TYPE_GPU, "AMD", {
+ { "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ { // Intel GPUs
+ CL_DEVICE_TYPE_GPU, "Intel", {
+ }
+ },
+ { // Default
+ CL_DEVICE_TYPE_ALL, kDefault, {
+ { kDefault, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
+ }
+ },
+ }
+};
+// =================================================================================================
+} // namespace clblast
diff --git a/include/internal/routine.h b/include/internal/routine.h
new file mode 100644
index 00000000..42357d8d
--- /dev/null
+++ b/include/internal/routine.h
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements all the basic functionality for the BLAS routines. The Routine class
+// serves as a base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common
+// functionality such as compiling the OpenCL kernel, connecting to the database, etc.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINE_H_
+#define CLBLAST_ROUTINE_H_
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/database.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+class Routine {
+ public:
+
+ // Khronos OpenCL extensions
+ const std::string kKhronosHalfPrecision = "cl_khr_fp16";
+ const std::string kKhronosDoublePrecision = "cl_khr_fp64";
+
+  // New data-type: the cache of compiled OpenCL programs, along with some meta-data
+ struct ProgramCache {
+ Program program;
+ std::string device_name;
+ Precision precision;
+ std::vector<std::string> routines;
+
+ // Finds out whether the properties match
+ bool MatchInCache(const std::string &ref_name, const Precision &ref_precision,
+ const std::vector<std::string> &ref_routines) {
+ auto ref_size = ref_routines.size();
+ if (device_name == ref_name && precision == ref_precision && routines.size() == ref_size) {
+ auto found_match = true;
+ for (auto i=size_t{0}; i<ref_size; ++i) {
+ if (routines[i] != ref_routines[i]) { found_match = false; }
+ }
+ return found_match;
+ }
+ return false;
+ }
+ };
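+
+  // Illustrative (hypothetical) cache look-up: a linear scan over "program_cache_" which calls
+  // "MatchInCache" on each entry; the actual look-up code lives in src/routine.cc. For example:
+  //   for (auto &entry: program_cache_) {
+  //     if (entry.MatchInCache(device_name_, precision_, routines_)) { return entry.program; }
+  //   }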
+
+ // The actual cache, implemented as a vector of the above data-type
+ static std::vector<ProgramCache> program_cache_;
+
+  // Helper function to check for errors in the status code
+ static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
+
+ // Base class constructor
+ explicit Routine(CommandQueue &queue, Event &event,
+ const std::vector<std::string> &routines, const Precision precision);
+
+ // Set-up phase of the kernel
+ StatusCode SetUp(const std::string &routine_source);
+
+ protected:
+
+ // Runs a kernel given the global and local thread sizes
+ StatusCode RunKernel(const Kernel &kernel, std::vector<size_t> &global,
+ const std::vector<size_t> &local);
+
+ // Tests for valid inputs of matrices A, B, and C
+ StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer &buffer,
+ const size_t offset, const size_t ld, const size_t data_size);
+ StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer &buffer,
+ const size_t offset, const size_t ld, const size_t data_size);
+ StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer &buffer,
+ const size_t offset, const size_t ld, const size_t data_size);
+
+ // Tests for valid inputs of vectors X and Y
+ StatusCode TestVectorX(const size_t n, const Buffer &buffer, const size_t offset,
+ const size_t inc, const size_t data_size);
+ StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
+ const size_t inc, const size_t data_size);
+
+  // Copies/transposes a matrix and pads/unpads it
+ StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
+ const size_t src_ld, const size_t src_offset,
+ const Buffer &src,
+ const size_t dest_one, const size_t dest_two,
+ const size_t dest_ld, const size_t dest_offset,
+ const Buffer &dest,
+ const bool do_transpose, const bool pad,
+ const Program &program);
+
+  // Queries the cache to retrieve either a matching program or a boolean indicating whether a
+  // match exists. The first function assumes that the program is available in the cache and will
+  // throw an exception otherwise.
+ Program GetProgramFromCache() const;
+ bool ProgramIsInCache() const;
+
+ // Non-static variable for the precision. Note that the same variable (but static) might exist in
+ // a derived class.
+ const Precision precision_;
+
+ // The OpenCL objects, accessible only from derived classes
+ CommandQueue queue_;
+ Event event_;
+ const Context context_;
+ const Device device_;
+
+ // OpenCL device properties
+ const std::string device_name_;
+ const cl_uint max_work_item_dimensions_;
+ const std::vector<size_t> max_work_item_sizes_;
+ const size_t max_work_group_size_;
+
+ // Connection to the database for all the device-specific parameters
+ const Database db_;
+ const std::vector<std::string> routines_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINE_H_
+#endif
diff --git a/include/internal/routines/xaxpy.h b/include/internal/routines/xaxpy.h
new file mode 100644
index 00000000..e548e553
--- /dev/null
+++ b/include/internal/routines/xaxpy.h
@@ -0,0 +1,42 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAXPY_H_
+#define CLBLAST_ROUTINES_XAXPY_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xaxpy: public Routine {
+ public:
+ Xaxpy(CommandQueue &queue, Event &event);
+
+ // Templated-precision implementation of the routine
+ StatusCode DoAxpy(const size_t n, const T alpha,
+ const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer &y_buffer, const size_t y_offset, const size_t y_inc);
+
+ private:
+ // Static variable to get the precision
+ const static Precision precision_;
+};
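+
+// Note: the static "precision_" member is presumably defined per template instantiation in the
+// corresponding source file, for example (an assumption, for illustration only):
+//   template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
+//   template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;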
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAXPY_H_
+#endif
diff --git a/include/internal/routines/xgemm.h b/include/internal/routines/xgemm.h
new file mode 100644
index 00000000..7ad4fcfb
--- /dev/null
+++ b/include/internal/routines/xgemm.h
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMM_H_
+#define CLBLAST_ROUTINES_XGEMM_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgemm: public Routine {
+ public:
+ Xgemm(CommandQueue &queue, Event &event);
+
+ // Templated-precision implementation of the routine
+ StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+ // Static variable to get the precision
+ const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMM_H_
+#endif
diff --git a/include/internal/routines/xsymm.h b/include/internal/routines/xsymm.h
new file mode 100644
index 00000000..c6545164
--- /dev/null
+++ b/include/internal/routines/xsymm.h
@@ -0,0 +1,60 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm routine. It is based on the generalized matrix multiplication
+// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the
+// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by
+// transforming it into a general matrix, and then calls the regular GEMM code.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYMM_H_
+#define CLBLAST_ROUTINES_XSYMM_H_
+
+#include "internal/routines/xgemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsymm: public Xgemm<T> {
+ public:
+
+ // Uses several variables from the Routine class
+ using Routine::db_;
+ using Routine::context_;
+
+ // Uses several helper functions from the Routine class
+ using Routine::RunKernel;
+ using Routine::ErrorIn;
+ using Routine::TestMatrixA;
+ using Routine::GetProgramFromCache;
+
+ // Uses the regular Xgemm routine
+ using Xgemm<T>::DoGemm;
+
+ // Constructor
+ Xsymm(CommandQueue &queue, Event &event);
+
+ // Templated-precision implementation of the routine
+ StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+};
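+
+// Rough outline of the "DoSymm" flow described above (a sketch only; the actual implementation
+// lives in src/routines/xsymm.cc):
+//   1) validate the inputs, in particular the symmetric matrix A (e.g. via TestMatrixA)
+//   2) expand the upper or lower triangle of A into a temporary general (squared) matrix
+//   3) forward the call to Xgemm<T>::DoGemm with this temporary matrix in place of A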
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYMM_H_
+#endif
diff --git a/include/internal/tuning.h b/include/internal/tuning.h
new file mode 100644
index 00000000..7768888c
--- /dev/null
+++ b/include/internal/tuning.h
@@ -0,0 +1,53 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the header for the tuner functions. These are only used for the optional and
+// stand-alone tuner binaries and are not part of the core of CLBlast. The convention used here is
+// that X and Y are vectors, while A, B, and C are matrices.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TUNING_H_
+#define CLBLAST_TUNING_H_
+
+#include <vector>
+#include <functional>
+
+#include <cltune.h>
+
+namespace clblast {
+// =================================================================================================
+
+// Functions with two or three OpenCL memory buffers
+template <typename T>
+using Tuner2 = std::function<void(const Arguments<T>&,
+ const std::vector<T>&, std::vector<T>&,
+ cltune::Tuner&)>;
+template <typename T>
+using Tuner3 = std::function<void(const Arguments<T>&,
+ const std::vector<T>&, const std::vector<T>&, std::vector<T>&,
+ cltune::Tuner&)>;
+
+// Tuner for vector-vector input
+template <typename T>
+void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function);
+
+// Tuner for matrix-matrix input
+template <typename T>
+void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function);
+
+// Tuner for matrix-matrix-matrix input
+template <typename T>
+void TunerABC(int argc, char* argv[], const Tuner3<T> &tune_function);
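+
+// A function passed as a "Tuner2" is expected to match the signature above, for example
+// (hypothetical, for illustration only):
+//   void TuneXaxpy(const Arguments<float> &args,
+//                  const std::vector<float> &x_vec, std::vector<float> &y_vec,
+//                  cltune::Tuner &tuner);
+// It would then be invoked as: TunerXY<float>(argc, argv, TuneXaxpy);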
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TUNING_H_
+#endif
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
new file mode 100644
index 00000000..af04dfdb
--- /dev/null
+++ b/include/internal/utilities.h
@@ -0,0 +1,174 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides declarations for the common (test) utility functions such as a command-line
+// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL
+// wrapper. These utilities are not only used by CLBlast itself, but are also part of the tuners,
+// the performance client and the correctness testers.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_UTILITIES_H_
+#define CLBLAST_UTILITIES_H_
+
+#include <string>
+#include <functional>
+#include <complex>
+
+#include "clblast.h"
+#include "internal/clpp11.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Shorthands for complex data-types
+using float2 = std::complex<float>;
+using double2 = std::complex<double>;
+
+// =================================================================================================
+
+// The routine-specific arguments in string form
+constexpr auto kArgM = "m";
+constexpr auto kArgN = "n";
+constexpr auto kArgK = "k";
+constexpr auto kArgLayout = "layout";
+constexpr auto kArgATransp = "transA";
+constexpr auto kArgBTransp = "transB";
+constexpr auto kArgSide = "side";
+constexpr auto kArgTriangle = "triangle";
+constexpr auto kArgXInc = "incx";
+constexpr auto kArgYInc = "incy";
+constexpr auto kArgXOffset = "offx";
+constexpr auto kArgYOffset = "offy";
+constexpr auto kArgALeadDim = "lda";
+constexpr auto kArgBLeadDim = "ldb";
+constexpr auto kArgCLeadDim = "ldc";
+constexpr auto kArgAOffset = "offa";
+constexpr auto kArgBOffset = "offb";
+constexpr auto kArgCOffset = "offc";
+constexpr auto kArgAlpha = "alpha";
+constexpr auto kArgBeta = "beta";
+
+// The tuner-specific arguments in string form
+constexpr auto kArgFraction = "fraction";
+
+// The client-specific arguments in string form
+constexpr auto kArgCompareclblas = "clblas";
+constexpr auto kArgStepSize = "step";
+constexpr auto kArgNumSteps = "num_steps";
+constexpr auto kArgNumRuns = "runs";
+
+// The common arguments in string form
+constexpr auto kArgPlatform = "platform";
+constexpr auto kArgDevice = "device";
+constexpr auto kArgPrecision = "precision";
+constexpr auto kArgHelp = "h";
+constexpr auto kArgQuiet = "q";
+constexpr auto kArgNoAbbreviations = "no_abbrv";
+
+// =================================================================================================
+
+// Structure containing all possible arguments for test clients, including their default values
+template <typename T>
+struct Arguments {
+ // Routine-specific arguments
+ size_t m = 0;
+ size_t n = 0;
+ size_t k = 0;
+ Layout layout = Layout::kRowMajor;
+ Transpose a_transpose = Transpose::kNo;
+ Transpose b_transpose = Transpose::kNo;
+ Side side = Side::kLeft;
+ Triangle triangle = Triangle::kUpper;
+ size_t x_inc = 1;
+ size_t y_inc = 1;
+ size_t x_offset = 0;
+ size_t y_offset = 0;
+ size_t a_ld = 0;
+ size_t b_ld = 0;
+ size_t c_ld = 0;
+ size_t a_offset = 0;
+ size_t b_offset = 0;
+ size_t c_offset = 0;
+ T alpha = T{1.0};
+ T beta = T{1.0};
+ // Tuner-specific arguments
+ double fraction = 1.0;
+ // Client-specific arguments
+  bool compare_clblas = true;
+ size_t step = 1;
+ size_t num_steps = 0;
+ size_t num_runs = 10;
+ // Common arguments
+ size_t platform_id = 0;
+ size_t device_id = 0;
+ Precision precision = Precision::kSingle;
+ bool print_help = false;
+ bool silent = false;
+ bool no_abbrv = false;
+};
+
+// =================================================================================================
+
+// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
+// data-types such as Layout and Transpose.
+template <typename T>
+std::string ToString(T value);
+
+// =================================================================================================
+
+// Helper for the function "GetArgument"
+template <typename T>
+T ConvertArgument(const char* value);
+
+// Basic argument parser, matching patterns in the form of "-option value" and "--option value"
+template <typename T>
+T GetArgument(const int argc, char *argv[], std::string &help,
+ const std::string &option, const T default_value);
+
+// Returns the precision only
+Precision GetPrecision(const int argc, char *argv[]);
+
+// As in "GetArgument", but now only checks whether an argument is given or not
+bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option);
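+
+// Example use of the parsers above (illustrative only), given a command-line such as "-m 256 -q":
+//   auto help = std::string{};
+//   auto m = GetArgument(argc, argv, help, kArgM, size_t{512});  // results in m == 256
+//   auto quiet = CheckArgument(argc, argv, help, kArgQuiet);     // results in quiet == true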
+
+// =================================================================================================
+
+// Returns a random number to be used as a seed
+unsigned int GetRandomSeed();
+
+// Populates a vector with random data
+template <typename T>
+void PopulateVector(std::vector<T> &vector);
+
+// =================================================================================================
+
+// Returns a scalar with a default value
+template <typename T>
+T GetScalar();
+
+// =================================================================================================
+
+// Rounding functions
+size_t CeilDiv(const size_t x, const size_t y);
+size_t Ceil(const size_t x, const size_t y);
+
+// Returns whether or not 'a' is a multiple of 'b'
+bool IsMultiple(const size_t a, const size_t b);
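+
+// Examples, assuming the usual definitions: CeilDiv(10, 4) == 3, Ceil(10, 4) == 12, and
+// IsMultiple(12, 4) == true.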
+
+// =================================================================================================
+
+// Convert the precision enum into bytes, e.g. a double takes up 8 bytes
+size_t GetBytes(const Precision precision);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_UTILITIES_H_
+#endif
diff --git a/samples/sgemm.cc b/samples/sgemm.cc
new file mode 100644
index 00000000..f4015278
--- /dev/null
+++ b/samples/sgemm.cc
@@ -0,0 +1,108 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does
+// require the Khronos C++ OpenCL API header file (not included). The example uses C++ features,
+// but CLBlast can also be used through the regular C-style OpenCL API.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <chrono>
+#include <vector>
+
+// Includes the C++ OpenCL API. If not yet available, it can be found here:
+// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
+#include <cl.hpp>
+
+// Includes the CLBlast library
+#include <clblast.h>
+
+// =================================================================================================
+
+// Example use of the single-precision Xgemm routine SGEMM
+int main() {
+
+ // OpenCL platform/device settings
+ const auto platform_id = 0;
+ const auto device_id = 0;
+
+ // Example SGEMM arguments
+ const size_t m = 128;
+ const size_t n = 64;
+ const size_t k = 512;
+ const float alpha = 0.7f;
+ const float beta = 1.0f;
+ const auto a_ld = k;
+ const auto b_ld = n;
+ const auto c_ld = n;
+
+ // Initializes the OpenCL platform
+ auto platforms = std::vector<cl::Platform>();
+ cl::Platform::get(&platforms);
+ if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
+ auto platform = platforms[platform_id];
+
+ // Initializes the OpenCL device (note: example for GPU devices only)
+ auto devices = std::vector<cl::Device>();
+ platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
+ if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
+ auto device = devices[device_id];
+
+ // Creates the OpenCL context, queue, and an event
+ auto context = cl::Context({device});
+ auto queue = cl::CommandQueue(context, device);
+ auto event = cl::Event();
+
+ // Populate host matrices with some example data
+ auto host_a = std::vector<float>(m*k);
+ auto host_b = std::vector<float>(n*k);
+ auto host_c = std::vector<float>(m*n);
+ for (auto &item: host_a) { item = 12.193f; }
+ for (auto &item: host_b) { item = -8.199f; }
+ for (auto &item: host_c) { item = 0.0f; }
+
+ // Copy the matrices to the device
+ auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(float));
+ auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(float));
+ auto device_c = cl::Buffer(context, CL_MEM_READ_WRITE, host_c.size()*sizeof(float));
+ queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(float), host_a.data());
+ queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(float), host_b.data());
+ queue.enqueueWriteBuffer(device_c, CL_TRUE, 0, host_c.size()*sizeof(float), host_c.data());
+
+ // Start the timer
+ auto start_time = std::chrono::steady_clock::now();
+
+  // Call the SGEMM routine. Note that the type of alpha and beta (float) determines the precision.
+ auto queue_plain = queue();
+ auto event_plain = event();
+  auto status = clblast::Gemm(clblast::Layout::kRowMajor,
+ clblast::Transpose::kNo, clblast::Transpose::kNo,
+ m, n, k,
+ alpha,
+ device_a(), 0, a_ld,
+ device_b(), 0, b_ld,
+ beta,
+ device_c(), 0, c_ld,
+ &queue_plain, &event_plain);
+
+ // Record the execution time
+ event.wait();
+ auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+ auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
+
+ // Example completed. See "clblast.h" for status codes (0 -> success).
+  printf("Completed in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/clblast.cc b/src/clblast.cc
new file mode 100644
index 00000000..72de3b24
--- /dev/null
+++ b/src/clblast.cc
@@ -0,0 +1,224 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements all the BLAS API calls. In all cases, it does little more than create a
+// new object of the appropriate type and call the main routine on that object. It forwards all
+// status codes to the caller.
+//
+// =================================================================================================
+
+#include <string>
+
+#include "clblast.h"
+
+// BLAS level-1 includes
+#include "internal/routines/xaxpy.h"
+
+// BLAS level-3 includes
+#include "internal/routines/xgemm.h"
+#include "internal/routines/xsymm.h"
+
+namespace clblast {
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+
+// AXPY
+template <typename T>
+StatusCode Axpy(const size_t n, const T alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = CommandQueue(*queue);
+ auto event_cpp = Event(*event);
+ auto xaxpy = Xaxpy<T>(queue_cpp, event_cpp);
+
+ // Loads the kernel source-code as an include (C++11 raw string literal)
+ std::string kernel_source =
+ #include "kernels/xaxpy.opencl"
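+  // Note: the .opencl file included above is presumably wrapped in a C++11 raw string literal,
+  // such that the #include expands to a plain string expression (an assumption; the kernel
+  // sources appear later in this commit).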
+ auto status = xaxpy.SetUp(kernel_source);
+ if (status != StatusCode::kSuccess) { return status; }
+
+ // Runs the routine
+ return xaxpy.DoAxpy(n, alpha,
+ Buffer(x_buffer), x_offset, x_inc,
+ Buffer(y_buffer), y_offset, y_inc);
+}
+template StatusCode Axpy<float>(const size_t, const float,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Axpy<double>(const size_t, const double,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Axpy<float2>(const size_t, const float2,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Axpy<double2>(const size_t, const double2,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+
+// GEMM
+template <typename T>
+StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = CommandQueue(*queue);
+ auto event_cpp = Event(*event);
+ auto xgemm = Xgemm<T>(queue_cpp, event_cpp);
+
+ // Loads the kernel source-code as an include (C++11 raw string literal)
+ std::string common_source1 =
+ #include "kernels/copy.opencl"
+ std::string common_source2 =
+ #include "kernels/pad.opencl"
+ std::string common_source3 =
+ #include "kernels/transpose.opencl"
+ std::string common_source4 =
+ #include "kernels/padtranspose.opencl"
+ std::string kernel_source =
+ #include "kernels/xgemm.opencl"
+ auto status = xgemm.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
+ kernel_source);
+ if (status != StatusCode::kSuccess) { return status; }
+
+ // Runs the routine
+ return xgemm.DoGemm(layout, transpose_a, transpose_b,
+ m, n, k,
+ alpha,
+ Buffer(a_buffer), a_offset, a_ld,
+ Buffer(b_buffer), b_offset, b_ld,
+ beta,
+ Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Gemm<float>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const float,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const float,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Gemm<double>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const double,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const double,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+/*
+template StatusCode Gemm<float2>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const float2,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const float2,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Gemm<double2>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const double2,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const double2,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+*/
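+// A minimal host-side usage sketch (illustrative only; the Layout/Transpose enumeration values
+// shown are assumptions about the declarations in clblast.h):
+//   cl_event event = nullptr;
+//   auto status = clblast::Gemm<float>(Layout::kColMajor, Transpose::kNo, Transpose::kNo,
+//                                      m, n, k, 1.0f,
+//                                      a_mem, 0, m,
+//                                      b_mem, 0, k,
+//                                      0.0f,
+//                                      c_mem, 0, m,
+//                                      &queue, &event);
+//   if (status != StatusCode::kSuccess) { /* handle the error */ }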
+
+// =================================================================================================
+
+// SYMM
+template <typename T>
+StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = CommandQueue(*queue);
+ auto event_cpp = Event(*event);
+ auto xsymm = Xsymm<T>(queue_cpp, event_cpp);
+
+ // Loads the kernel source-code as an include (C++11 raw string literal)
+ std::string common_source1 =
+ #include "kernels/copy.opencl"
+ std::string common_source2 =
+ #include "kernels/pad.opencl"
+ std::string common_source3 =
+ #include "kernels/transpose.opencl"
+ std::string common_source4 =
+ #include "kernels/padtranspose.opencl"
+ std::string kernel_source =
+ #include "kernels/xgemm.opencl"
+ auto status = xsymm.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
+ kernel_source);
+ if (status != StatusCode::kSuccess) { return status; }
+
+ // Runs the routine
+ return xsymm.DoSymm(layout, side, triangle,
+ m, n,
+ alpha,
+ Buffer(a_buffer), a_offset, a_ld,
+ Buffer(b_buffer), b_offset, b_ld,
+ beta,
+ Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Symm<float>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const float,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const float,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Symm<double>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const double,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const double,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+/*
+template StatusCode Symm<float2>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const float2,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const float2,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const double2,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const double2,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+*/
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database.cc b/src/database.cc
new file mode 100644
index 00000000..beaa122b
--- /dev/null
+++ b/src/database.cc
@@ -0,0 +1,112 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Database class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/database.h"
+#include "internal/database/xaxpy.h"
+#include "internal/database/xgemm.h"
+#include "internal/database/copy.h"
+#include "internal/database/pad.h"
+#include "internal/database/transpose.h"
+#include "internal/database/padtranspose.h"
+
+#include "internal/utilities.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Initializes the database
+const std::vector<Database::DatabaseEntry> Database::database = {
+ XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
+ XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
+ CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
+ PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
+ TraSingle, TraDouble, TraComplexSingle, TraComplexDouble,
+ PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble
+};
+
+// =================================================================================================
+
+// Constructor, computing device properties and populating the parameter-vector from the database
+Database::Database(const CommandQueue &queue, const std::vector<std::string> &kernels,
+ const Precision precision):
+ parameters_{} {
+
+ // Finds information about the current device
+ auto device = queue.GetDevice();
+ auto device_type = device.Type();
+ auto device_vendor = device.Vendor();
+ auto device_name = device.Name();
+
+ // Iterates over all kernels to include, and retrieves the parameters for each of them
+ for (auto &kernel: kernels) {
+ auto search_result = Search(kernel, device_type, device_vendor, device_name, precision);
+ parameters_.insert(search_result.begin(), search_result.end());
+ }
+}
+
+// =================================================================================================
+
+// Returns a list of OpenCL pre-processor defines in string form
+std::string Database::GetDefines() const {
+ std::string defines{};
+ for (auto &parameter: parameters_) {
+ defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n";
+ }
+ return defines;
+}
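+// For illustration (hypothetical parameter values), the string returned above could be:
+//   "#define COPY_DIMX 8\n#define COPY_DIMY 8\n#define COPY_WPT 1\n"
+// which is intended to be prepended to the kernel source so that the tuned values override the
+// kernel's built-in defaults.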
+
+// =================================================================================================
+
+// Searches the database for the right kernel and precision
+Database::Parameters Database::Search(const std::string &this_kernel,
+ const cl_device_type this_type,
+ const std::string &this_vendor,
+ const std::string &this_device,
+ const Precision this_precision) const {
+ for (auto &db: database) {
+ if (db.kernel == this_kernel && db.precision == this_precision) {
+
+ // Searches for the right vendor and device type, or selects the default if unavailable. This
+ // assumes that the default vendor / device type is last in the database.
+ for (auto &vendor: db.vendors) {
+ if (VendorEqual(vendor.name, this_vendor) &&
+ (vendor.type == this_type || vendor.type == CL_DEVICE_TYPE_ALL)) {
+
+ // Searches for the right device. If the current device is unavailable, selects the vendor
+ // default parameters. This assumes the default is last in the database.
+ for (auto &device: vendor.devices) {
+ if (device.name == this_device || device.name == kDefault) {
+
+ // Sets the parameters accordingly
+ return device.parameters;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // If we reached this point, something is wrong
+ throw std::runtime_error("Database error, could not find a suitable entry");
+}
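+// For example (hypothetical database contents): a search for kernel "Xgemm" on an unknown GPU of
+// a known vendor falls through the device loop until it reaches that vendor's kDefault entry, and
+// an unknown vendor falls through to the generic default entry stored last in the database.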
+
+// Determines equality between two vendor names. This is implemented because vendor names can be
+// ambiguous and might change between different SDK or driver versions.
+bool Database::VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const {
+ if (db_vendor == kDefault) { return true; }
+ if (db_vendor == cl_vendor) { return true; }
+ return false;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
new file mode 100644
index 00000000..154265e4
--- /dev/null
+++ b/src/kernels/common.opencl
@@ -0,0 +1,120 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common defines and type-defs for the CLBlast OpenCL kernels.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this file is used outside of the CLBlast library.
+#ifndef PRECISION
+ #define PRECISION 32 // Data-types: single or double precision, complex or regular
+#endif
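+// The PRECISION value encodes the data-type: 32 = single (S), 64 = double (D),
+// 3232 = complex single (C) and 6464 = complex double (Z), matching the type-defs below.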
+
+// =================================================================================================
+
+// Enable support for double-precision
+#if PRECISION == 64 || PRECISION == 6464
+ #if __OPENCL_VERSION__ <= CL_VERSION_1_1
+ #pragma OPENCL EXTENSION cl_khr_fp64: enable
+ #endif
+#endif
+
+// Single-precision
+#if PRECISION == 32
+ typedef float real;
+ typedef float2 real2;
+ typedef float4 real4;
+ typedef float8 real8;
+ typedef float16 real16;
+ #define ZERO 0.0f
+
+// Double-precision
+#elif PRECISION == 64
+ typedef double real;
+ typedef double2 real2;
+ typedef double4 real4;
+ typedef double8 real8;
+ typedef double16 real16;
+ #define ZERO 0.0
+
+// Complex single-precision
+#elif PRECISION == 3232
+ typedef struct cfloat {float x; float y;} real;
+ typedef struct cfloat2 {real x; real y;} real2;
+ typedef struct cfloat4 {real x; real y; real z; real w;} real4;
+ typedef struct cfloat8 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;} real8;
+ typedef struct cfloat16 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;
+ real s8; real s9; real sA; real sB;
+ real sC; real sD; real sE; real sF;} real16;
+ #define ZERO 0.0f
+
+// Complex double-precision
+#elif PRECISION == 6464
+ typedef struct cdouble {double x; double y;} real;
+ typedef struct cdouble2 {real x; real y;} real2;
+ typedef struct cdouble4 {real x; real y; real z; real w;} real4;
+ typedef struct cdouble8 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;} real8;
+ typedef struct cdouble16 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;
+ real s8; real s9; real sA; real sB;
+ real sC; real sD; real sE; real sF;} real16;
+ #define ZERO 0.0
+#endif
+
+// =================================================================================================
+
+// Whether to use the non-IEEE754 compliant OpenCL built-in mad() instruction (1) or not (0)
+#define USE_CL_MAD 0
+
+// Sets a variable to zero
+#if PRECISION == 3232 || PRECISION == 6464
+ #define SetToZero(a) a.x = ZERO; a.y = ZERO
+#else
+ #define SetToZero(a) a = ZERO
+#endif
+
+// Multiply two complex variables (used in the define below)
+#if PRECISION == 3232 || PRECISION == 6464
+ #define MulReal(a, b) a.x*b.x - a.y*b.y
+ #define MulImag(a, b) a.x*b.y + a.y*b.x
+#endif
+
+// The scalar multiply-add function
+#if PRECISION == 3232 || PRECISION == 6464
+ #define MultiplyAdd(c, a, b) c.x += MulReal(a,b); c.y += MulImag(a,b)
+#else
+ #if USE_CL_MAD == 1
+ #define MultiplyAdd(c, a, b) c = mad(a, b, c)
+ #else
+ #define MultiplyAdd(c, a, b) c += a * b
+ #endif
+#endif
+
+// The scalar AXPBY function
+#if PRECISION == 3232 || PRECISION == 6464
+ #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)
+#else
+ #define AXPBY(e, a, b, c, d) e = a*b + c*d
+#endif
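+// Usage example (illustrative only): for the real precisions the sequence
+//   SetToZero(acc); MultiplyAdd(acc, a, b);
+// leaves 'acc' equal to a*b, whereas for the complex precisions the same macros update the .x
+// and .y components according to complex arithmetic.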
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/copy.opencl b/src/kernels/copy.opencl
new file mode 100644
index 00000000..f95b476b
--- /dev/null
+++ b/src/kernels/copy.opencl
@@ -0,0 +1,73 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS routines. This file contains
+// kernels to copy matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef COPY_DIMX
+ #define COPY_DIMX 8 // Local workgroup size in the first dimension (x)
+#endif
+#ifndef COPY_DIMY
+ #define COPY_DIMY 8 // Local workgroup size in the second dimension (y)
+#endif
+#ifndef COPY_WPT
+ #define COPY_WPT 1 // Work per thread in the second dimension (y)
+#endif
+#ifndef COPY_VW
+ #define COPY_VW 1 // Vector width in the first dimension (x)
+#endif
+
+// =================================================================================================
+
+// Data-widths
+#if COPY_VW == 1
+ typedef real realC;
+#elif COPY_VW == 2
+ typedef real2 realC;
+#elif COPY_VW == 4
+ typedef real4 realC;
+#elif COPY_VW == 8
+ typedef real8 realC;
+#elif COPY_VW == 16
+ typedef real16 realC;
+#endif
+
+// =================================================================================================
+
+// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
+// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
+__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+__kernel void CopyMatrix(const int ld,
+ __global const realC* restrict src,
+ __global realC* dest) {
+ #pragma unroll
+ for (int w_one=0; w_one<COPY_WPT; ++w_one) {
+ const int id_one = get_global_id(0);
+ const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
+ const int id = id_two*(ld/COPY_VW) + id_one;
+ dest[id] = src[id];
+ }
+}
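+// Worked example (hypothetical values, single precision): with COPY_VW == 4 and ld == 1024, the
+// matrices are accessed as float4 pointers, the row stride becomes ld/COPY_VW == 256 vector
+// elements, and each work-item copies COPY_WPT vectors, one per row it is assigned.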
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl
new file mode 100644
index 00000000..ccaeb9d6
--- /dev/null
+++ b/src/kernels/pad.opencl
@@ -0,0 +1,180 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS routines. This file contains
+// kernels to copy and pad matrices in various ways, including:
+// 1) copying into a larger matrix by adding padding
+// 2) copying into a smaller matrix by removing padding
+// 3) from upper/lower triangle into a full matrix
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef PAD_DIMX
+ #define PAD_DIMX 8 // Local workgroup size in the first dimension (x)
+#endif
+#ifndef PAD_DIMY
+ #define PAD_DIMY 8 // Local workgroup size in the second dimension (y)
+#endif
+#ifndef PAD_WPTX
+ #define PAD_WPTX 1 // Work per thread in the first dimension (x)
+#endif
+#ifndef PAD_WPTY
+ #define PAD_WPTY 1 // Work per thread in the second dimension (y)
+#endif
+
+// =================================================================================================
+
+// Copies a matrix from source to destination. The output is padded with zero values in case the
+// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
+// value and offset can be different.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void PadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_two && id_one < dest_one) {
+
+ // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the
+ // value to be written to zero.
+ real value;
+ SetToZero(value);
+ if (id_two < src_two && id_one < src_one) {
+ value = src[id_two*src_ld + id_one + src_offset];
+ }
+
+ // Stores the value in the destination matrix
+ dest[id_two*dest_ld + id_one + dest_offset] = value;
+ }
+ }
+ }
+}
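+// For example (hypothetical sizes): padding a 1000x1000 source into a 1024x1024 destination
+// copies the original values into the top-left 1000x1000 block and writes zeroes elsewhere, so
+// that later kernels can assume dimensions that are multiples of their tile sizes.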
+
+// =================================================================================================
+
+// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
+// writes only the actual data back to the destination matrix. Again, the ld value and offset can
+// be different.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void UnPadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_two && id_one < dest_one) {
+
+ // Copies the value into the destination matrix. This is always within bounds of the source
+ // matrix, as we know that the destination matrix is smaller than the source.
+ dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// Kernel to populate a square symmetric matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmLowerToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_dim && id_one < dest_dim) {
+
+ // Loads data from the lower-symmetric matrix
+ real value;
+ SetToZero(value);
+ if (id_two < src_dim && id_one < src_dim) {
+ if (id_two <= id_one) { value = src[id_two*src_ld + id_one + src_offset]; }
+ else { value = src[id_one*src_ld + id_two + src_offset]; }
+ }
+
+ // Stores the value in the destination matrix
+ dest[id_two*dest_ld + id_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// Same as above, but now the matrix's data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmUpperToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_dim && id_one < dest_dim) {
+
+ // Loads data from the upper-symmetric matrix
+ real value;
+ SetToZero(value);
+ if (id_two < src_dim && id_one < src_dim) {
+ if (id_one <= id_two) { value = src[id_two*src_ld + id_one + src_offset]; }
+ else { value = src[id_one*src_ld + id_two + src_offset]; }
+ }
+
+ // Stores the value in the destination matrix
+ dest[id_two*dest_ld + id_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/padtranspose.opencl b/src/kernels/padtranspose.opencl
new file mode 100644
index 00000000..67cbf341
--- /dev/null
+++ b/src/kernels/padtranspose.opencl
@@ -0,0 +1,150 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS functions. This file contains
+// kernels to transpose matrices in various ways, including:
+// 1) transposing into a larger matrix by adding padding
+// 2) transposing into a smaller matrix by removing padding
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef PADTRA_TILE
+ #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef PADTRA_WPT
+ #define PADTRA_WPT 1 // Amount of work per thread
+#endif
+#ifndef PADTRA_PAD
+ #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
+#endif
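+// For example (hypothetical values): with PADTRA_TILE == 8 and PADTRA_WPT == 2, each workgroup
+// handles a 16x16 tile of the matrix and the local array below holds 16 x (16 + PADTRA_PAD) reals.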
+
+// =================================================================================================
+
+// Same as the PadMatrix kernel, but now also does the transpose
+__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+__kernel void PadTransposeMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Local memory to store a tile of the matrix (for coalescing)
+ __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the source matrix. Note that the local and global dimensions
+ // do not correspond to each other!
+ const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
+ const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
+
+ // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
+ // Otherwise, set the local memory value to zero.
+ real value;
+ SetToZero(value);
+ if (id_src_two < src_two && id_src_one < src_one) {
+ value = src[id_src_two*src_ld + id_src_one + src_offset];
+ }
+ tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
+ }
+ }
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the destination matrix
+ const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
+ const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
+
+ // Stores the transposed value in the destination matrix
+ if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+ real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+ dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// Same as the UnPadMatrix kernel, but now also does the transpose
+__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+__kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Local memory to store a tile of the matrix (for coalescing)
+ __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the source matrix. Note that the local and global dimensions
+ // do not correspond to each other!
+ const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
+ const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
+
+ // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
+ if ((id_src_one < src_one) && (id_src_two < src_two)) {
+ real value = src[id_src_two*src_ld + id_src_one + src_offset];
+ tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
+ }
+ }
+ }
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the destination matrix
+ const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
+ const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
+
+ // Stores the transposed value in the destination matrix
+ if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+ real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+ dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/transpose.opencl b/src/kernels/transpose.opencl
new file mode 100644
index 00000000..79ab1688
--- /dev/null
+++ b/src/kernels/transpose.opencl
@@ -0,0 +1,168 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS functions. This file contains
+// kernels to transpose matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef TRA_DIM
+ #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef TRA_WPT
+ #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other
+#endif
+#ifndef TRA_PAD
+ #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
+#endif
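+// For example (hypothetical values): with TRA_DIM == 8 and TRA_WPT == 4, a workgroup of 8x8
+// threads transposes a 32x32 tile, reading and writing vectors of 4 elements at a time.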
+
+// =================================================================================================
+
+// Data-widths
+#if TRA_WPT == 1
+ typedef real realT;
+#elif TRA_WPT == 2
+ typedef real2 realT;
+#elif TRA_WPT == 4
+ typedef real4 realT;
+#elif TRA_WPT == 8
+ typedef real8 realT;
+#elif TRA_WPT == 16
+ typedef real16 realT;
+#endif
+
+// =================================================================================================
+
+// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
+// offset. A more general version is available in 'padtranspose.opencl'.
+__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+__kernel void TransposeMatrix(const int ld,
+ __global const realT* restrict src,
+ __global realT* dest) {
+
+ // Local memory to store a tile of the matrix (for coalescing)
+ __local real tile[TRA_WPT*TRA_DIM][TRA_WPT*TRA_DIM + TRA_PAD];
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<TRA_WPT; ++w_one) {
+
+ // Computes the identifiers for the source matrix. Note that the local and global dimensions
+ // do not correspond to each other!
+ const int id_one = get_group_id(1) * TRA_DIM + get_local_id(0);
+ const int id_two = (get_group_id(0) * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
+
+ // Loads data into the local memory
+ realT value = src[id_two*(ld/TRA_WPT) + id_one];
+ #if TRA_WPT == 1
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value;
+ #elif TRA_WPT == 2
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
+ #elif TRA_WPT == 4
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
+ tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.z;
+ tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.w;
+ #elif TRA_WPT == 8
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
+ tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
+ tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
+ tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
+ tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
+ tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
+ tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
+ #elif TRA_WPT == 16
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
+ tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
+ tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
+ tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
+ tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
+ tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
+ tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
+ tile[get_local_id(1)*TRA_WPT + 8][get_local_id(0)*TRA_WPT + w_one] = value.s8;
+ tile[get_local_id(1)*TRA_WPT + 9][get_local_id(0)*TRA_WPT + w_one] = value.s9;
+ tile[get_local_id(1)*TRA_WPT + 10][get_local_id(0)*TRA_WPT + w_one] = value.sA;
+ tile[get_local_id(1)*TRA_WPT + 11][get_local_id(0)*TRA_WPT + w_one] = value.sB;
+ tile[get_local_id(1)*TRA_WPT + 12][get_local_id(0)*TRA_WPT + w_one] = value.sC;
+ tile[get_local_id(1)*TRA_WPT + 13][get_local_id(0)*TRA_WPT + w_one] = value.sD;
+ tile[get_local_id(1)*TRA_WPT + 14][get_local_id(0)*TRA_WPT + w_one] = value.sE;
+ tile[get_local_id(1)*TRA_WPT + 15][get_local_id(0)*TRA_WPT + w_one] = value.sF;
+ #endif
+ }
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_two=0; w_two<TRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the destination matrix
+ const int id_one = get_global_id(0);
+ const int id_two = get_global_id(1)*TRA_WPT + w_two;
+
+ // Stores the transposed value in the destination matrix
+ realT value;
+ #if TRA_WPT == 1
+ value = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ #elif TRA_WPT == 2
+ value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ #elif TRA_WPT == 4
+ value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ value.z = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
+ value.w = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
+ #elif TRA_WPT == 8
+ value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
+ value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
+ value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
+ value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
+ value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
+ value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
+ #elif TRA_WPT == 16
+ value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
+ value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
+ value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
+ value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
+ value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
+ value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
+ value.s8 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 8];
+ value.s9 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 9];
+ value.sA = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 10];
+ value.sB = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 11];
+ value.sC = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 12];
+ value.sD = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 13];
+ value.sE = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 14];
+ value.sF = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 15];
+ #endif
+ dest[id_two*(ld/TRA_WPT) + id_one] = value;
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/xaxpy.opencl b/src/kernels/xaxpy.opencl
new file mode 100644
index 00000000..40c6c3bd
--- /dev/null
+++ b/src/kernels/xaxpy.opencl
@@ -0,0 +1,128 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
+// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
+// support vector data-types.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef WGS
+ #define WGS 64 // The local work-group size
+#endif
+#ifndef WPT
+ #define WPT 1 // The amount of work-per-thread
+#endif
+#ifndef VW
+ #define VW 1 // Vector width of vectors X and Y
+#endif
+
+// =================================================================================================
+
+// Data-widths
+#if VW == 1
+ typedef real realV;
+#elif VW == 2
+ typedef real2 realV;
+#elif VW == 4
+ typedef real4 realV;
+#elif VW == 8
+ typedef real8 realV;
+#elif VW == 16
+ typedef real16 realV;
+#endif
+
+// =================================================================================================
+
+// The vectorized multiply-add function
+inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
+ #if VW == 1
+ MultiplyAdd(cvec, aval, bvec);
+ #elif VW == 2
+ MultiplyAdd(cvec.x, aval, bvec.x);
+ MultiplyAdd(cvec.y, aval, bvec.y);
+ #elif VW == 4
+ MultiplyAdd(cvec.x, aval, bvec.x);
+ MultiplyAdd(cvec.y, aval, bvec.y);
+ MultiplyAdd(cvec.z, aval, bvec.z);
+ MultiplyAdd(cvec.w, aval, bvec.w);
+ #elif VW == 8
+ MultiplyAdd(cvec.s0, aval, bvec.s0);
+ MultiplyAdd(cvec.s1, aval, bvec.s1);
+ MultiplyAdd(cvec.s2, aval, bvec.s2);
+ MultiplyAdd(cvec.s3, aval, bvec.s3);
+ MultiplyAdd(cvec.s4, aval, bvec.s4);
+ MultiplyAdd(cvec.s5, aval, bvec.s5);
+ MultiplyAdd(cvec.s6, aval, bvec.s6);
+ MultiplyAdd(cvec.s7, aval, bvec.s7);
+ #elif VW == 16
+ MultiplyAdd(cvec.s0, aval, bvec.s0);
+ MultiplyAdd(cvec.s1, aval, bvec.s1);
+ MultiplyAdd(cvec.s2, aval, bvec.s2);
+ MultiplyAdd(cvec.s3, aval, bvec.s3);
+ MultiplyAdd(cvec.s4, aval, bvec.s4);
+ MultiplyAdd(cvec.s5, aval, bvec.s5);
+ MultiplyAdd(cvec.s6, aval, bvec.s6);
+ MultiplyAdd(cvec.s7, aval, bvec.s7);
+ MultiplyAdd(cvec.s8, aval, bvec.s8);
+ MultiplyAdd(cvec.s9, aval, bvec.s9);
+ MultiplyAdd(cvec.sA, aval, bvec.sA);
+ MultiplyAdd(cvec.sB, aval, bvec.sB);
+ MultiplyAdd(cvec.sC, aval, bvec.sC);
+ MultiplyAdd(cvec.sD, aval, bvec.sD);
+ MultiplyAdd(cvec.sE, aval, bvec.sE);
+ MultiplyAdd(cvec.sF, aval, bvec.sF);
+ #endif
+ return cvec;
+}
+
+// =================================================================================================
+
+// Full version of the kernel with offsets and strided accesses
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void Xaxpy(const int n, const real alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
+
+ // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+ #pragma unroll
+ for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+ MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
+ }
+}
+
+// =================================================================================================
+
+// Faster version of the kernel without offsets or strided accesses. It also assumes that 'n' is
+// divisible by 'VW', 'WGS' and 'WPT'.
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void XaxpyFast(const int n, const real alpha,
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
+ #pragma unroll
+ for (int w=0; w<WPT; ++w) {
+ const int id = w*get_global_size(0) + get_global_id(0);
+ ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
+ }
+}
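+// For example (hypothetical values): with VW == 2, WGS == 64 and WPT == 4, each work-item updates
+// four real2 elements of Y, so the fast path requires 'n' to be divisible by 2*64*4 = 512.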
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl
new file mode 100644
index 00000000..facaf5dc
--- /dev/null
+++ b/src/kernels/xgemm.opencl
@@ -0,0 +1,570 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
+// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
+// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
+// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
+//
+// Matrices are accessed as follows:
+// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
+// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
+// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
+//
+// Or as an image (assuming column-major)
+// K
+// o-------o
+// | |
+// N | [B^T] |
+// | |
+// o-------o
+// K N
+// o-------o o-----o
+// M | [A] | M | [C] |
+// | | | |
+// o-------o o-----o
+//
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef MWG
+ #define MWG 8 // Tile-size in dimension M (e.g. 64, 128)
+#endif
+#ifndef NWG
+ #define NWG 8 // Tile-size in dimension N (e.g. 64, 128)
+#endif
+#ifndef KWG
+ #define KWG 8 // Tile-size in dimension K (e.g. 8, 16)
+#endif
+#ifndef MDIMC
+ #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef NDIMC
+ #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef MDIMA
+ #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
+#endif
+#ifndef NDIMB
+ #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
+#endif
+#ifndef KWI
+ #define KWI 1 // Unroll factor of the KWG loop (smaller than or equal to KWG)
+#endif
+#ifndef VWM
+ #define VWM 1 // Vector width of matrices A and C
+#endif
+#ifndef VWN
+ #define VWN 1 // Vector width of matrix B
+#endif
+#ifndef STRM
+ #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0)
+#endif
+#ifndef STRN
+ #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0)
+#endif
+#ifndef SA
+ #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0)
+#endif
+#ifndef SB
+ #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0)
+#endif
+
+// Helper parameters based on the above tuning parameters
+#define MWI (MWG/MDIMC) // Work per work-item (M-dimension)
+#define NWI (NWG/NDIMC) // Work per work-item (N-dimension)
+#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
+#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
+#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension)
+#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension)
+#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension)
+#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension)
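+
+// Worked example (hypothetical tuning values): with MWG = NWG = 64, KWG = 16, MDIMC = NDIMC = 8
+// and VWM = VWN = 1, each work-item computes an MWI x NWI = 8 x 8 block of C, and each workgroup
+// of 8x8 threads produces a 64x64 tile of C, iterating over K in chunks of KWG = 16.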
+
+// Settings
+#define USE_VECTOR_MAD 0 // Use the vector multiply-add directly (1) or unroll it manually (0)
+
+// =================================================================================================
+
+// Data-widths in dimension M
+#if VWM == 1
+ typedef real realM;
+#elif VWM == 2
+ typedef real2 realM;
+#elif VWM == 4
+ typedef real4 realM;
+#elif VWM == 8
+ typedef real8 realM;
+#elif VWM == 16
+ typedef real16 realM;
+#endif
+
+// Data-widths in dimension N
+#if VWN == 1
+ typedef real realN;
+#elif VWN == 2
+ typedef real2 realN;
+#elif VWN == 4
+ typedef real4 realN;
+#elif VWN == 8
+ typedef real8 realN;
+#elif VWN == 16
+ typedef real16 realN;
+#endif
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix.
+#if SA == 1
+inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
+ const int kSizeM, const int tid, const int kwg) {
+ const int la0 = tid % MDIMA;
+ const int la1 = tid / MDIMA;
+ #pragma unroll
+ for (int mia=0; mia<MWA/VWM; ++mia) {
+ #pragma unroll
+ for (int kia=0; kia<KWA; ++kia) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRM == 0
+ int mg = mia + la0*(MWA/VWM);
+ #elif STRM == 1
+ int mg = la0 + mia*MDIMA;
+ #endif
+
+ // Computes the indices for the global memory
+ int kg = kia + la1*KWA;
+ int idm = mg + get_group_id(0)*(MWG/VWM);
+ int idk = kg + kwg;
+
+ // Loads the data from global memory (not transposed) into the local memory
+ alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
+ }
+ }
+}
+#endif
+
+// Same as above, but now for the B input matrix
+#if SB == 1
+inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
+ const int kSizeN, const int tid, const int kwg) {
+ const int lb0 = tid % NDIMB;
+ const int lb1 = tid / NDIMB;
+ #pragma unroll
+ for (int kib=0; kib<KWB; ++kib) {
+ #pragma unroll
+ for (int nib=0; nib<NWB/VWN; ++nib) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRN == 0
+ int ng = nib + lb0*(NWB/VWN);
+ #elif STRN == 1
+ int ng = lb0 + nib*NDIMB;
+ #endif
+
+ // Computes the indices for the global memory
+ int kg = kib + lb1*KWB;
+ int idn = ng + get_group_id(1)*(NWG/VWN);
+ int idk = kg + kwg;
+
+ // Loads the data from global memory (transposed) into the local memory
+ blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
+ }
+ }
+}
+#endif
+
+// =================================================================================================
+
+// Caches global off-chip memory directly into per-thread private memory (registers). This function
+// is specific for caching the A input matrix.
+#if SA == 0
+inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
+ const int kSizeM, const int idk, const int kwg) {
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRM == 0
+ int mg = mi + get_local_id(0)*(MWI/VWM);
+ #elif STRM == 1
+ int mg = get_local_id(0) + mi*MDIMC;
+ #endif
+
+ // Computes the indices for the global memory
+ int idm = mg + get_group_id(0)*(MWG/VWM);
+
+ // Loads the data from global memory (not transposed) and stores into registers
+ apm[mi] = agm[idk*(kSizeM/VWM) + idm];
+ }
+}
+#endif
+
+// Same as above, but now for the B input matrix
+#if SB == 0
+inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
+ const int kSizeN, const int idk) {
+ #pragma unroll
+ for (int ni=0; ni<NWI/VWN; ++ni) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRN == 0
+ int ng = ni + get_local_id(1)*(NWI/VWN);
+ #elif STRN == 1
+ int ng = get_local_id(1) + ni*NDIMC;
+ #endif
+
+ // Computes the indices for the global memory
+ int idn = ng + get_group_id(1)*(NWG/VWN);
+
+ // Loads the data from global memory (transposed) and stores into registers
+ bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
+ }
+}
+#endif
+
+// =================================================================================================
+
+// Caches on-chip local memory into per-thread private memory (registers). This function is specific
+// for caching the A input matrix.
+#if SA == 1
+inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+ #if STRM == 0
+ int mg = mi + get_local_id(0)*(MWI/VWM);
+ #elif STRM == 1
+ int mg = get_local_id(0) + mi*MDIMC;
+ #endif
+ apm[mi] = alm[kg*(MWG/VWM) + mg];
+ }
+}
+#endif
+
+// Same as above, but now for the B input matrix
+#if SB == 1
+inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
+ #pragma unroll
+ for (int ni=0; ni<NWI/VWN; ++ni) {
+ #if STRN == 0
+ int ng = ni + get_local_id(1)*(NWI/VWN);
+ #elif STRN == 1
+ int ng = get_local_id(1) + ni*NDIMC;
+ #endif
+ bpm[ni] = blm[kg*(NWG/VWN) + ng];
+ }
+}
+#endif
+
+// =================================================================================================
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
+ const real alpha, const real beta) {
+ #pragma unroll
+ for (int ni=0; ni<NWI; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+ #if STRM == 0
+ int mg = mi + get_local_id(0)*(MWI/VWM);
+ #elif STRM == 1
+ int mg = get_local_id(0) + mi*MDIMC;
+ #endif
+ #if STRN == 0
+ int ng = ni + get_local_id(1)*NWI;
+ #elif STRN == 1
+ int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
+ #endif
+ int idm = mg + get_group_id(0)*(MWG/VWM);
+ int idn = ng + get_group_id(1)*NWG;
+ int index = idn*(kSizeM/VWM) + idm;
+ #if VWM == 1
+ AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cgm[index]);
+ #elif VWM == 2
+ AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cgm[index].x);
+ AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cgm[index].y);
+ #elif VWM == 4
+ AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cgm[index].x);
+ AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cgm[index].y);
+ AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cgm[index].z);
+ AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cgm[index].w);
+ #elif VWM == 8
+ AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cgm[index].s0);
+ AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cgm[index].s1);
+ AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cgm[index].s2);
+ AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cgm[index].s3);
+ AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cgm[index].s4);
+ AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cgm[index].s5);
+ AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cgm[index].s6);
+ AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cgm[index].s7);
+ #elif VWM == 16
+ AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cgm[index].s0);
+ AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cgm[index].s1);
+ AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cgm[index].s2);
+ AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cgm[index].s3);
+ AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cgm[index].s4);
+ AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cgm[index].s5);
+ AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cgm[index].s6);
+ AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cgm[index].s7);
+ AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cgm[index].s8);
+ AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cgm[index].s9);
+ AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cgm[index].sA);
+ AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cgm[index].sB);
+ AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cgm[index].sC);
+ AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cgm[index].sD);
+ AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cgm[index].sE);
+ AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cgm[index].sF);
+ #endif
+ }
+ }
+}
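+// For instance (illustrative only): with alpha == 1 and beta == 0, the AXPBY calls above reduce
+// to plain stores of the accumulated products, i.e. Cgm = Cpm.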
+
+// =================================================================================================
+
+// The vectorized multiply-add function
+inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
+ #if USE_VECTOR_MAD == 1
+ cvec += avec * bval;
+ #else
+ #if VWM == 1
+ MultiplyAdd(cvec, avec, bval);
+ #elif VWM == 2
+ MultiplyAdd(cvec.x , avec.x, bval);
+ MultiplyAdd(cvec.y , avec.y, bval);
+ #elif VWM == 4
+ MultiplyAdd(cvec.x , avec.x, bval);
+ MultiplyAdd(cvec.y , avec.y, bval);
+ MultiplyAdd(cvec.z , avec.z, bval);
+ MultiplyAdd(cvec.w , avec.w, bval);
+ #elif VWM == 8
+ MultiplyAdd(cvec.s0, avec.s0, bval);
+ MultiplyAdd(cvec.s1, avec.s1, bval);
+ MultiplyAdd(cvec.s2, avec.s2, bval);
+ MultiplyAdd(cvec.s3, avec.s3, bval);
+ MultiplyAdd(cvec.s4, avec.s4, bval);
+ MultiplyAdd(cvec.s5, avec.s5, bval);
+ MultiplyAdd(cvec.s6, avec.s6, bval);
+ MultiplyAdd(cvec.s7, avec.s7, bval);
+ #elif VWM == 16
+ MultiplyAdd(cvec.s0, avec.s0, bval);
+ MultiplyAdd(cvec.s1, avec.s1, bval);
+ MultiplyAdd(cvec.s2, avec.s2, bval);
+ MultiplyAdd(cvec.s3, avec.s3, bval);
+ MultiplyAdd(cvec.s4, avec.s4, bval);
+ MultiplyAdd(cvec.s5, avec.s5, bval);
+ MultiplyAdd(cvec.s6, avec.s6, bval);
+ MultiplyAdd(cvec.s7, avec.s7, bval);
+ MultiplyAdd(cvec.s8, avec.s8, bval);
+ MultiplyAdd(cvec.s9, avec.s9, bval);
+ MultiplyAdd(cvec.sA, avec.sA, bval);
+ MultiplyAdd(cvec.sB, avec.sB, bval);
+ MultiplyAdd(cvec.sC, avec.sC, bval);
+ MultiplyAdd(cvec.sD, avec.sD, bval);
+ MultiplyAdd(cvec.sE, avec.sE, bval);
+ MultiplyAdd(cvec.sF, avec.sF, bval);
+ #endif
+ #endif
+ return cvec;
+}
+
+// Performs the actual computation: Cpm += Apm * Bpm
+inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) {
+ #pragma unroll
+ for (int ni=0; ni<NWI/VWN; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+ #if VWN == 1
+ cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni]);
+ #elif VWN == 2
+ cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
+ cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
+ #elif VWN == 4
+ cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
+ cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
+ cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].z);
+ cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].w);
+ #elif VWN == 8
+ cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].s0);
+ cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].s1);
+ cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].s2);
+ cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].s3);
+ cpm[ni*VWN + 4][mi] = MultiplyAddVector(cpm[ni*VWN + 4][mi], apm[mi], bpm[ni].s4);
+ cpm[ni*VWN + 5][mi] = MultiplyAddVector(cpm[ni*VWN + 5][mi], apm[mi], bpm[ni].s5);
+ cpm[ni*VWN + 6][mi] = MultiplyAddVector(cpm[ni*VWN + 6][mi], apm[mi], bpm[ni].s6);
+ cpm[ni*VWN + 7][mi] = MultiplyAddVector(cpm[ni*VWN + 7][mi], apm[mi], bpm[ni].s7);
+ #elif VWN == 16
+ cpm[ni*VWN + 0 ][mi] = MultiplyAddVector(cpm[ni*VWN + 0 ][mi], apm[mi], bpm[ni].s0);
+ cpm[ni*VWN + 1 ][mi] = MultiplyAddVector(cpm[ni*VWN + 1 ][mi], apm[mi], bpm[ni].s1);
+ cpm[ni*VWN + 2 ][mi] = MultiplyAddVector(cpm[ni*VWN + 2 ][mi], apm[mi], bpm[ni].s2);
+ cpm[ni*VWN + 3 ][mi] = MultiplyAddVector(cpm[ni*VWN + 3 ][mi], apm[mi], bpm[ni].s3);
+ cpm[ni*VWN + 4 ][mi] = MultiplyAddVector(cpm[ni*VWN + 4 ][mi], apm[mi], bpm[ni].s4);
+ cpm[ni*VWN + 5 ][mi] = MultiplyAddVector(cpm[ni*VWN + 5 ][mi], apm[mi], bpm[ni].s5);
+ cpm[ni*VWN + 6 ][mi] = MultiplyAddVector(cpm[ni*VWN + 6 ][mi], apm[mi], bpm[ni].s6);
+ cpm[ni*VWN + 7 ][mi] = MultiplyAddVector(cpm[ni*VWN + 7 ][mi], apm[mi], bpm[ni].s7);
+ cpm[ni*VWN + 8 ][mi] = MultiplyAddVector(cpm[ni*VWN + 8 ][mi], apm[mi], bpm[ni].s8);
+ cpm[ni*VWN + 9 ][mi] = MultiplyAddVector(cpm[ni*VWN + 9 ][mi], apm[mi], bpm[ni].s9);
+ cpm[ni*VWN + 10][mi] = MultiplyAddVector(cpm[ni*VWN + 10][mi], apm[mi], bpm[ni].sA);
+ cpm[ni*VWN + 11][mi] = MultiplyAddVector(cpm[ni*VWN + 11][mi], apm[mi], bpm[ni].sB);
+ cpm[ni*VWN + 12][mi] = MultiplyAddVector(cpm[ni*VWN + 12][mi], apm[mi], bpm[ni].sC);
+ cpm[ni*VWN + 13][mi] = MultiplyAddVector(cpm[ni*VWN + 13][mi], apm[mi], bpm[ni].sD);
+ cpm[ni*VWN + 14][mi] = MultiplyAddVector(cpm[ni*VWN + 14][mi], apm[mi], bpm[ni].sE);
+ cpm[ni*VWN + 15][mi] = MultiplyAddVector(cpm[ni*VWN + 15][mi], apm[mi], bpm[ni].sF);
+ #endif
+ }
+ }
+}
+
+// =================================================================================================
+
+// Main entry point of the kernel. This function contains the basic skeleton; the functionality
+// is provided by the inlined functions above.
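+// Each workgroup of MDIMC * NDIMC threads computes one MWG * NWG tile of the result matrix C,
+// looping over the K dimension in chunks of KWG. When SA and/or SB are set, tiles of A and/or B
+// are first staged in local memory before being loaded into private memory (registers).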
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real alpha, const real beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+
+ // Combined thread identifier
+ #if SA == 1 || SB == 1
+ volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
+ #endif
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Allocates workitem-private memory (registers)
+ realM apm[MWI/VWM];
+ realN bpm[NWI/VWN];
+ realM cpm[NWI][MWI/VWM];
+
+ // Initializes the accumulation registers
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+ #pragma unroll
+ for (int ni=0; ni<NWI; ++ni) {
+ #if VWM == 1
+ SetToZero(cpm[ni][mi]);
+ #elif VWM == 2
+ SetToZero(cpm[ni][mi].x);
+ SetToZero(cpm[ni][mi].y);
+ #elif VWM == 4
+ SetToZero(cpm[ni][mi].x);
+ SetToZero(cpm[ni][mi].y);
+ SetToZero(cpm[ni][mi].z);
+ SetToZero(cpm[ni][mi].w);
+ #elif VWM == 8
+ SetToZero(cpm[ni][mi].s0);
+ SetToZero(cpm[ni][mi].s1);
+ SetToZero(cpm[ni][mi].s2);
+ SetToZero(cpm[ni][mi].s3);
+ SetToZero(cpm[ni][mi].s4);
+ SetToZero(cpm[ni][mi].s5);
+ SetToZero(cpm[ni][mi].s6);
+ SetToZero(cpm[ni][mi].s7);
+ #elif VWM == 16
+ SetToZero(cpm[ni][mi].s0);
+ SetToZero(cpm[ni][mi].s1);
+ SetToZero(cpm[ni][mi].s2);
+ SetToZero(cpm[ni][mi].s3);
+ SetToZero(cpm[ni][mi].s4);
+ SetToZero(cpm[ni][mi].s5);
+ SetToZero(cpm[ni][mi].s6);
+ SetToZero(cpm[ni][mi].s7);
+ SetToZero(cpm[ni][mi].s8);
+ SetToZero(cpm[ni][mi].s9);
+ SetToZero(cpm[ni][mi].sA);
+ SetToZero(cpm[ni][mi].sB);
+ SetToZero(cpm[ni][mi].sC);
+ SetToZero(cpm[ni][mi].sD);
+ SetToZero(cpm[ni][mi].sE);
+ SetToZero(cpm[ni][mi].sF);
+ #endif
+ }
+ }
+
+ // Loops over all workgroup tiles
+ for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
+
+ // Loads data: off-chip --> local (matrix A)
+ #if SA == 1
+ GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
+ #endif
+ // Loads data: off-chip --> local (matrix B)
+ #if SB == 1
+ GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
+ #endif
+
+ // Synchronizes all threads in a workgroup
+ #if SA == 1 || SB == 1
+ barrier(CLK_LOCAL_MEM_FENCE);
+ #endif
+
+ // Loops over all workitem tiles, unrolled by a factor KWI
+ for (int pwi=0; pwi<KWG; pwi+=KWI) {
+ #pragma unroll
+ for (int pit=0; pit<KWI; ++pit) {
+ #if SA == 0 || SB == 0
+ int idk = kwg + pwi + pit;
+ #endif
+ #if SA == 1 || SB == 1
+ int kg = pwi+pit;
+ #endif
+
+ // Loads data: local --> private (matrix A)
+ #if SA == 1
+ LocalToPrivateA(alm, apm, kg);
+ // Loads data: off-chip --> private (matrix A)
+ #else
+ GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+ #endif
+
+ // Loads data: local --> private (matrix B)
+ #if SB == 1
+ LocalToPrivateB(blm, bpm, kg);
+ // Loads data: off-chip --> private (matrix B)
+ #else
+ GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+ #endif
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulate(cpm, apm, bpm);
+ }
+ }
+
+ // Synchronizes all threads in a workgroup
+ #if SA == 1 || SB == 1
+ barrier(CLK_LOCAL_MEM_FENCE);
+ #endif
+ }
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeM, alpha, beta);
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/routine.cc b/src/routine.cc
new file mode 100644
index 00000000..32face4a
--- /dev/null
+++ b/src/routine.cc
@@ -0,0 +1,326 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Routine base class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routine.h"
+
+#include "internal/utilities.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The cache of compiled OpenCL programs
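+// (programs are cached per device name, precision, and routine-name combination, so that the
+// same source is not compiled more than once for the same configuration)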
+std::vector<Routine::ProgramCache> Routine::program_cache_;
+
+// Constructor: not much here, because no status codes can be returned
+Routine::Routine(CommandQueue &queue, Event &event,
+ const std::vector<std::string> &routines, const Precision precision):
+ precision_(precision),
+ queue_(queue),
+ event_(event),
+ context_(queue_.GetContext()),
+ device_(queue_.GetDevice()),
+ device_name_(device_.Name()),
+ max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
+ max_work_item_sizes_(device_.MaxWorkItemSizes()),
+ max_work_group_size_(device_.MaxWorkGroupSize()),
+ db_(queue_, routines, precision_),
+ routines_(routines) {
+}
+
+// =================================================================================================
+
+// Separate set-up function to allow for status codes to be returned
+StatusCode Routine::SetUp(const std::string &routine_source) {
+
+ // Queries the cache to see whether or not the compiled kernel is already there. If not, it will
+ // be built and added to the cache.
+ if (!ProgramIsInCache()) {
+
+ // Inspects whether or not cl_khr_fp64 is supported in case of double precision
+ auto extensions = device_.Extensions();
+ if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
+ if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
+ return StatusCode::kNoDoublePrecision;
+ }
+ }
+
+ // As above, but for cl_khr_fp16 (half precision)
+ if (precision_ == Precision::kHalf) {
+ if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
+ return StatusCode::kNoHalfPrecision;
+ }
+ }
+
+ // Loads the common header (typedefs and defines and such)
+ std::string common_header =
+ #include "kernels/common.opencl"
+
+ // Collects the parameters for this device in the form of defines, and adds the precision
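+  // (e.g. the database might produce lines such as "#define MWG 64" for the Xgemm kernel)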
+ auto defines = db_.GetDefines();
+ defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
+ auto source_string = defines + common_header + routine_source;
+
+ // Compiles the kernel
+ try {
+ auto program = Program(context_, source_string);
+ auto options = std::string{};
+ auto status = program.Build(device_, options);
+
+ // Checks for compiler crashes/errors/warnings
+ if (status == CL_BUILD_PROGRAM_FAILURE) {
+ auto message = program.GetBuildInfo(device_);
+ fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
+ return StatusCode::kBuildProgramFailure;
+ }
+ if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }
+
+ // Store the compiled program in the cache
+ program_cache_.push_back({program, device_name_, precision_, routines_});
+ } catch (...) { return StatusCode::kBuildProgramFailure; }
+ }
+
+ // No errors, normal termination of this function
+ return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Enqueues a kernel, waits for completion, and checks for errors
+StatusCode Routine::RunKernel(const Kernel &kernel, std::vector<size_t> &global,
+ const std::vector<size_t> &local) {
+
+ // Tests for validity of the local thread sizes
+ if (local.size() > max_work_item_dimensions_) {
+ return StatusCode::kInvalidLocalNumDimensions;
+ }
+ for (auto i=size_t{0}; i<local.size(); ++i) {
+ if (local[i] > max_work_item_sizes_[i]) { return StatusCode::kInvalidLocalThreadsDim; }
+ }
+ auto local_size = size_t{1};
+ for (auto &item: local) { local_size *= item; }
+ if (local_size > max_work_group_size_) { return StatusCode::kInvalidLocalThreadsTotal; }
+
+ // Make sure the global thread sizes are at least equal to the local sizes
+ for (auto i=size_t{0}; i<global.size(); ++i) {
+ if (global[i] < local[i]) { global[i] = local[i]; }
+ }
+
+ // Tests for local memory usage
+ auto local_mem_usage = kernel.LocalMemUsage(device_);
+ if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
+
+ // Launches the kernel (and checks for launch errors)
+ auto status = queue_.EnqueueKernel(kernel, global, local, event_);
+ if (status != CL_SUCCESS) { return StatusCode::kKernelLaunchError; }
+
+ // Waits for completion of the kernel
+ status = event_.Wait();
+ if (status != CL_SUCCESS) { return StatusCode::kKernelRunError; }
+
+ // No errors, normal termination of this function
+ return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid leading dimension, and
+// for a sufficient buffer size.
+StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buffer &buffer,
+ const size_t offset, const size_t ld, const size_t data_size) {
+ if (ld < one) { return StatusCode::kInvalidLeadDimA; }
+ try {
+ auto required_size = (ld*two + offset)*data_size;
+ auto buffer_size = buffer.GetSize();
+ if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
+ } catch (...) { return StatusCode::kInvalidMatrixA; }
+ return StatusCode::kSuccess;
+}
+
+// Tests matrix B for validity: checks for a valid OpenCL buffer, a valid leading dimension, and
+// for a sufficient buffer size.
+StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buffer &buffer,
+ const size_t offset, const size_t ld, const size_t data_size) {
+ if (ld < one) { return StatusCode::kInvalidLeadDimB; }
+ try {
+ auto required_size = (ld*two + offset)*data_size;
+ auto buffer_size = buffer.GetSize();
+ if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; }
+ } catch (...) { return StatusCode::kInvalidMatrixB; }
+ return StatusCode::kSuccess;
+}
+
+// Tests matrix C for validity: checks for a valid OpenCL buffer, a valid leading dimension, and
+// for a sufficient buffer size.
+StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buffer &buffer,
+ const size_t offset, const size_t ld, const size_t data_size) {
+ if (ld < one) { return StatusCode::kInvalidLeadDimC; }
+ try {
+ auto required_size = (ld*two + offset)*data_size;
+ auto buffer_size = buffer.GetSize();
+ if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; }
+ } catch (...) { return StatusCode::kInvalidMatrixC; }
+ return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
+// sufficient buffer size.
+StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, const size_t offset,
+ const size_t inc, const size_t data_size) {
+ if (inc == 0) { return StatusCode::kInvalidIncrementX; }
+ try {
+ auto required_size = (n*inc + offset)*data_size;
+ auto buffer_size = buffer.GetSize();
+ if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
+ } catch (...) { return StatusCode::kInvalidVectorX; }
+ return StatusCode::kSuccess;
+}
+
+// Tests vector Y for validity: checks for a valid increment, a valid OpenCL buffer, and for a
+// sufficient buffer size.
+StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
+ const size_t inc, const size_t data_size) {
+ if (inc == 0) { return StatusCode::kInvalidIncrementY; }
+ try {
+ auto required_size = (n*inc + offset)*data_size;
+ auto buffer_size = buffer.GetSize();
+ if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
+ } catch (...) { return StatusCode::kInvalidVectorY; }
+ return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Copies and/or transposes a matrix and optionally pads it with zeros
+StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
+ const size_t src_ld, const size_t src_offset,
+ const Buffer &src,
+ const size_t dest_one, const size_t dest_two,
+ const size_t dest_ld, const size_t dest_offset,
+ const Buffer &dest,
+ const bool do_transpose, const bool pad,
+ const Program &program) {
+
+ // Determines whether or not the fast-version could potentially be used
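+  // (the fast kernels assume zero offsets and matching dimensions and leading dimensions; the
+  // additional divisibility requirements are checked below)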
+ auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) &&
+ (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
+
+ // Determines the right kernel
+ auto kernel_name = std::string{};
+ if (do_transpose) {
+ if (use_fast_kernel &&
+ IsMultiple(src_ld, db_["TRA_WPT"]) &&
+ IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) &&
+ IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"])) {
+ kernel_name = "TransposeMatrix";
+ }
+ else {
+ use_fast_kernel = false;
+ kernel_name = (pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
+ }
+ }
+ else {
+ if (use_fast_kernel &&
+ IsMultiple(src_ld, db_["COPY_VW"]) &&
+ IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) &&
+ IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"])) {
+ kernel_name = "CopyMatrix";
+ }
+ else {
+ use_fast_kernel = false;
+ kernel_name = (pad) ? "PadMatrix" : "UnPadMatrix";
+ }
+ }
+
+ // Retrieves the kernel from the compiled binary
+ try {
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(src_ld));
+ kernel.SetArgument(1, src());
+ kernel.SetArgument(2, dest());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(src_one));
+ kernel.SetArgument(1, static_cast<int>(src_two));
+ kernel.SetArgument(2, static_cast<int>(src_ld));
+ kernel.SetArgument(3, static_cast<int>(src_offset));
+ kernel.SetArgument(4, src());
+ kernel.SetArgument(5, static_cast<int>(dest_one));
+ kernel.SetArgument(6, static_cast<int>(dest_two));
+ kernel.SetArgument(7, static_cast<int>(dest_ld));
+ kernel.SetArgument(8, static_cast<int>(dest_offset));
+ kernel.SetArgument(9, dest());
+ }
+
+ // Launches the kernel and returns the error code. Uses global and local thread sizes based on
+ // parameters in the database.
+ auto status = StatusCode::kSuccess;
+ if (do_transpose) {
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"],
+ dest_two / db_["TRA_WPT"]};
+ auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
+ status = RunKernel(kernel, global, local);
+ }
+ else {
+ auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
+ Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
+ auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
+ status = RunKernel(kernel, global, local);
+ }
+ }
+ else {
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{dest_one / db_["COPY_VW"],
+ dest_two / db_["COPY_WPT"]};
+ auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
+ status = RunKernel(kernel, global, local);
+ }
+ else {
+ auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+ Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+ auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+ status = RunKernel(kernel, global, local);
+ }
+ }
+ return status;
+ } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+
+// Queries the cache and retrieves a matching program. Assumes that a match is available and
+// throws otherwise.
+Program Routine::GetProgramFromCache() const {
+ for (auto &cached_program: program_cache_) {
+ if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
+ return cached_program.program;
+ }
+ }
+ throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
+}
+
+// Queries the cache to see whether or not the compiled kernel is already there
+bool Routine::ProgramIsInCache() const {
+ for (auto &cached_program: program_cache_) {
+ if (cached_program.MatchInCache(device_name_, precision_, routines_)) { return true; }
+ }
+ return false;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/xaxpy.cc b/src/routines/xaxpy.cc
new file mode 100644
index 00000000..309ae3ce
--- /dev/null
+++ b/src/routines/xaxpy.cc
@@ -0,0 +1,115 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/xaxpy.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
+template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
+template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
+ Routine(queue, event, {"Xaxpy"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
+ const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // Makes sure all dimensions are larger than zero
+ if (n == 0) { return StatusCode::kInvalidDimension; }
+
+ // Tests the vectors for validity
+ auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+ if (ErrorIn(status)) { return status; }
+ status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+ if (ErrorIn(status)) { return status; }
+
+ // Determines whether or not the fast-version can be used
+ bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
+ (y_offset == 0) && (y_inc == 1) &&
+ IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
+
+ // If possible, run the fast-version of the kernel
+ auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
+
+ // Retrieves the Xaxpy kernel from the compiled binary
+ try {
+ auto program = GetProgramFromCache();
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ if (use_fast_kernel) {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, alpha);
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, y_buffer());
+ }
+ else {
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, alpha);
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, static_cast<int>(x_offset));
+ kernel.SetArgument(4, static_cast<int>(x_inc));
+ kernel.SetArgument(5, y_buffer());
+ kernel.SetArgument(6, static_cast<int>(y_offset));
+ kernel.SetArgument(7, static_cast<int>(y_inc));
+ }
+
+ // Launches the kernel
+ if (use_fast_kernel) {
+ auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ status = RunKernel(kernel, global, local);
+ }
+ else {
+ auto n_ceiled = Ceil(n, db_["WGS"]);
+ auto global = std::vector<size_t>{CeilDiv(n_ceiled, db_["WPT"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ status = RunKernel(kernel, global, local);
+ }
+ if (ErrorIn(status)) { return status; }
+
+ // Waits for all kernels to finish
+ queue_.Finish();
+
+    // Successfully finished the computation
+ return StatusCode::kSuccess;
+ } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xaxpy<float>;
+template class Xaxpy<double>;
+template class Xaxpy<float2>;
+template class Xaxpy<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc
new file mode 100644
index 00000000..16bbc154
--- /dev/null
+++ b/src/routines/xgemm.cc
@@ -0,0 +1,168 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/xgemm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
+template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
+template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
+ Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xgemm<T>::DoGemm(const Layout layout,
+ const Transpose a_transpose, const Transpose b_transpose,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+ // Makes sure all dimensions are larger than zero
+ if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
+
+ // Computes whether or not the matrices are transposed in memory. This is based on their layout
+ // (row or column-major) and whether or not they are requested to be pre-transposed. Note
+ // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of
+ // col-major) to be transformed, so transposing requirements are not the same as whether or not
+ // the matrix is actually transposed in memory.
+ auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
+ (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
+ auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
+ (layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
+ auto c_rotated = (layout == Layout::kRowMajor);
+ auto a_do_transpose = a_rotated;
+ auto b_do_transpose = !b_rotated;
+ auto c_do_transpose = c_rotated;
+
+ // Computes the first and second dimensions of the 3 matrices taking into account whether the
+ // matrices are rotated or not
+ auto a_one = (a_rotated) ? k : m;
+ auto a_two = (a_rotated) ? m : k;
+ auto b_one = (b_rotated) ? n : k;
+ auto b_two = (b_rotated) ? k : n;
+ auto c_one = (c_rotated) ? n : m;
+ auto c_two = (c_rotated) ? m : n;
+
+  // Tests the three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers
+  // and their sizes, and then from a perspective of the parameter values (e.g. m, n, k). Tests
+  // whether the OpenCL buffers are valid and non-zero and whether they have sufficient storage
+  // space. Also tests the leading dimensions:
+  //   matrix A cannot be less than K when rotated, or less than M when not rotated
+  //   matrix B cannot be less than N when rotated, or less than K when not rotated
+  //   matrix C cannot be less than N when rotated, or less than M when not rotated
+ auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+ if (ErrorIn(status)) { return status; }
+ status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
+ if (ErrorIn(status)) { return status; }
+ status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
+ if (ErrorIn(status)) { return status; }
+
+ // Calculates the ceiled versions of m, n, and k
+ auto m_ceiled = Ceil(m, db_["MWG"]);
+ auto n_ceiled = Ceil(n, db_["NWG"]);
+ auto k_ceiled = Ceil(k, db_["KWG"]);
+
+ // Allocates space on the device for padded and/or transposed input and output matrices.
+ try {
+ auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
+ auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+ auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
+
+ // Loads the program from the database
+ auto program = GetProgramFromCache();
+
+ // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
+ // them up until they reach a certain multiple of size (kernel parameter dependent).
+ status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+ m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
+ a_do_transpose, true, program);
+ if (ErrorIn(status)) { return status; }
+ status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
+ n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
+ b_do_transpose, true, program);
+ if (ErrorIn(status)) { return status; }
+
+    // Only necessary for matrix C if it is used both as input and output
+ if (beta != static_cast<T>(0)) {
+ status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
+ m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
+ c_do_transpose, true, program);
+ if (ErrorIn(status)) { return status; }
+ }
+
+ // Retrieves the Xgemm kernel from the compiled binary
+ try {
+ auto kernel = Kernel(program, "Xgemm");
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(m_ceiled));
+ kernel.SetArgument(1, static_cast<int>(n_ceiled));
+ kernel.SetArgument(2, static_cast<int>(k_ceiled));
+ kernel.SetArgument(3, alpha);
+ kernel.SetArgument(4, beta);
+ kernel.SetArgument(5, temp_a());
+ kernel.SetArgument(6, temp_b());
+ kernel.SetArgument(7, temp_c());
+
+ // Computes the global and local thread sizes
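+      // (each MDIMC * NDIMC workgroup computes one MWG * NWG tile of C; for example, with
+      // m = n = 2048, MWG = NWG = 64, and MDIMC = NDIMC = 16 this yields 512 x 512 threads)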
+ auto global = std::vector<size_t>{
+ (m_ceiled * db_["MDIMC"]) / db_["MWG"],
+ (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+ };
+ auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+ // Launches the kernel
+ status = RunKernel(kernel, global, local);
+ if (ErrorIn(status)) { return status; }
+
+ // Runs the post-processing kernel
+ status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
+ c_one, c_two, c_ld, c_offset, c_buffer,
+ c_do_transpose, false, program);
+ if (ErrorIn(status)) { return status; }
+
+ // Successfully finished the computation
+ return StatusCode::kSuccess;
+ } catch (...) { return StatusCode::kInvalidKernel; }
+ } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgemm<float>;
+template class Xgemm<double>;
+template class Xgemm<float2>;
+template class Xgemm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/xsymm.cc b/src/routines/xsymm.cc
new file mode 100644
index 00000000..aa43593d
--- /dev/null
+++ b/src/routines/xsymm.cc
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/xsymm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xsymm<T>::Xsymm(CommandQueue &queue, Event &event):
+ Xgemm<T>(queue, event) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+ // Makes sure all dimensions are larger than zero
+ if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+
+ // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
+ // left) or B (on the right) in the Xgemm routine.
+ size_t k = (side == Side::kLeft) ? m : n;
+
+ // Checks for validity of the squared A matrix
+ auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+ if (ErrorIn(status)) { return status; }
+
+  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
+ bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+ auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
+
+ // Temporary buffer for a copy of the symmetric matrix
+ try {
+ auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+
+ // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
+ // routine afterwards
+ try {
+ auto program = GetProgramFromCache();
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the arguments for the symmetric-to-squared kernel
+ kernel.SetArgument(0, static_cast<int>(k));
+ kernel.SetArgument(1, static_cast<int>(a_ld));
+ kernel.SetArgument(2, static_cast<int>(a_offset));
+ kernel.SetArgument(3, a_buffer());
+ kernel.SetArgument(4, static_cast<int>(k));
+ kernel.SetArgument(5, static_cast<int>(k));
+ kernel.SetArgument(6, static_cast<int>(0));
+ kernel.SetArgument(7, temp_symm());
+
+      // Uses the common padding kernel's thread configuration. This is allowed, since the
+      // symmetric-to-squared kernel uses the same parameters.
+ auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+ Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+ auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+ status = RunKernel(kernel, global, local);
+ if (ErrorIn(status)) { return status; }
+
+ // Runs the regular Xgemm code with either "C := AB+C" or ...
+ if (side == Side::kLeft) {
+ status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+ m, n, k,
+ alpha,
+ temp_symm, 0, k,
+ b_buffer, b_offset, b_ld,
+ beta,
+ c_buffer, c_offset, c_ld);
+ }
+
+ // ... with "C := BA+C". Note that A and B are now reversed.
+ else {
+ status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+ m, n, k,
+ alpha,
+ b_buffer, b_offset, b_ld,
+ temp_symm, 0, k,
+ beta,
+ c_buffer, c_offset, c_ld);
+
+ // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+ switch(status) {
+ case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
+ case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
+ case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
+ case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
+ case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
+ case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
+ }
+ }
+
+ // Return the status of the Xgemm routine
+ return status;
+ } catch (...) { return StatusCode::kInvalidKernel; }
+ } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xsymm<float>;
+template class Xsymm<double>;
+template class Xsymm<float2>;
+template class Xsymm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc
new file mode 100644
index 00000000..da223bf0
--- /dev/null
+++ b/src/tuning/copy.cc
@@ -0,0 +1,83 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the copy OpenCL kernels. It uses CLTune.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The copy auto-tuner
+template <typename T>
+void CopyTune(const Arguments<T> &args,
+ const std::vector<T> &a_mat, std::vector<T> &b_mat,
+ cltune::Tuner &tuner) {
+
+ // This points to the CopyMatrix kernel as found in the CLBlast library. This is just one example
+ // of a copy kernel. However, all copy-kernels use the same tuning parameters, so one has to be
+ // chosen as a representative.
+ std::string common_source =
+ #include "../src/kernels/common.opencl"
+ std::string kernel_source =
+ #include "../src/kernels/copy.opencl"
+ auto sources = common_source + kernel_source;
+ auto id = tuner.AddKernelFromString(sources, "CopyMatrix", {args.m, args.n}, {1, 1});
+ tuner.SetReferenceFromString(sources, "CopyMatrix", {args.m, args.n}, {8, 8});
+
+ // Sets the tunable parameters and their possible values
+ tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32});
+ tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32});
+ tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8});
+ tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8});
+
+ // Tests for a specific precision
+ tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+ tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+ // Modifies the thread-sizes (both global and local) based on the parameters
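+  // (the global size is divided since each thread is assumed to process COPY_VW elements in the
+  // first dimension and COPY_WPT elements in the second)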
+ tuner.MulLocalSize(id, {"COPY_DIMX", "COPY_DIMY"});
+ tuner.DivGlobalSize(id, {"COPY_VW", "COPY_WPT"});
+
+ // Sets the function's arguments
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentInput(a_mat);
+ tuner.AddArgumentOutput(b_mat);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void TunerCopy(int argc, char *argv[]) {
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: TunerAB<float>(argc, argv, CopyTune<float>); break;
+ case Precision::kDouble: TunerAB<double>(argc, argv, CopyTune<double>); break;
+ case Precision::kComplexSingle: TunerAB<float2>(argc, argv, CopyTune<float2>); break;
+ case Precision::kComplexDouble: TunerAB<double2>(argc, argv, CopyTune<double2>); break;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::TunerCopy(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc
new file mode 100644
index 00000000..93312df2
--- /dev/null
+++ b/src/tuning/pad.cc
@@ -0,0 +1,90 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the pad-copy OpenCL kernels. It uses CLTune.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The pad auto-tuner
+template <typename T>
+void PadTune(const Arguments<T> &args,
+ const std::vector<T> &a_mat, std::vector<T> &b_mat,
+ cltune::Tuner &tuner) {
+
+ // This points to the PadMatrix kernel as found in the CLBlast library. This is just one
+ // example of a pad kernel. However, all pad-kernels use the same tuning parameters, so one has
+ // to be chosen as a representative.
+ std::string common_source =
+ #include "../src/kernels/common.opencl"
+ std::string kernel_source =
+ #include "../src/kernels/pad.opencl"
+ auto sources = common_source + kernel_source;
+ auto id = tuner.AddKernelFromString(sources, "PadMatrix", {args.m, args.n}, {1, 1});
+ tuner.SetReferenceFromString(sources, "PadMatrix", {args.m, args.n}, {8, 8});
+
+ // Sets the tunable parameters and their possible values
+ tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32});
+ tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32});
+ tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4});
+ tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4});
+
+ // Tests for a specific precision
+ tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+ tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+ // Modifies the thread-sizes (both global and local) based on the parameters
+ tuner.MulLocalSize(id, {"PAD_DIMX", "PAD_DIMY"});
+ tuner.DivGlobalSize(id, {"PAD_WPTX", "PAD_WPTY"});
+
+ // Sets the function's arguments
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(0);
+ tuner.AddArgumentInput(a_mat);
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(0);
+ tuner.AddArgumentOutput(b_mat);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void TunerPad(int argc, char *argv[]) {
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: TunerAB<float>(argc, argv, PadTune<float>); break;
+ case Precision::kDouble: TunerAB<double>(argc, argv, PadTune<double>); break;
+ case Precision::kComplexSingle: TunerAB<float2>(argc, argv, PadTune<float2>); break;
+ case Precision::kComplexDouble: TunerAB<double2>(argc, argv, PadTune<double2>); break;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::TunerPad(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc
new file mode 100644
index 00000000..b2af9925
--- /dev/null
+++ b/src/tuning/padtranspose.cc
@@ -0,0 +1,95 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the pad-transpose OpenCL kernels. It uses CLTune.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The transpose auto-tuner
+template <typename T>
+void PadTransposeTune(const Arguments<T> &args,
+ const std::vector<T> &a_mat, std::vector<T> &b_mat,
+ cltune::Tuner &tuner) {
+
+ // This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
+ // example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
+ // to be chosen as a representative.
+ std::string common_source =
+ #include "../src/kernels/common.opencl"
+ std::string kernel_source =
+ #include "../src/kernels/padtranspose.opencl"
+ auto sources = common_source + kernel_source;
+ auto id = tuner.AddKernelFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {1, 1});
+ tuner.SetReferenceFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {8, 8});
+
+ // Sets the tunable parameters and their possible values
+ tuner.AddParameter(id, "PADTRA_TILE", {8, 16, 32, 64});
+ tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16});
+ tuner.AddParameter(id, "PADTRA_PAD", {0, 1});
+
+ // Tests for a specific precision
+ tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+ tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+ // Sets the constraints for local memory size limitations
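+  // (the formula below computes the bytes of a local tile of (PADTRA_WPT*PADTRA_TILE) by
+  // (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) elements)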
+ auto LocalMemorySize = [args] (std::vector<size_t> v) {
+ return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
+ };
+ tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"});
+
+ // Modifies the thread-sizes (both global and local) based on the parameters
+ tuner.DivGlobalSize(id, {"PADTRA_WPT", "PADTRA_WPT"});
+ tuner.MulLocalSize(id, {"PADTRA_TILE", "PADTRA_TILE"});
+
+ // Sets the function's arguments
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(0);
+ tuner.AddArgumentInput(a_mat);
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(0);
+ tuner.AddArgumentOutput(b_mat);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void TunerPadTranspose(int argc, char *argv[]) {
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: TunerAB<float>(argc, argv, PadTransposeTune<float>); break;
+ case Precision::kDouble: TunerAB<double>(argc, argv, PadTransposeTune<double>); break;
+ case Precision::kComplexSingle: TunerAB<float2>(argc, argv, PadTransposeTune<float2>); break;
+ case Precision::kComplexDouble: TunerAB<double2>(argc, argv, PadTransposeTune<double2>); break;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::TunerPadTranspose(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc
new file mode 100644
index 00000000..90392866
--- /dev/null
+++ b/src/tuning/transpose.cc
@@ -0,0 +1,88 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the transpose OpenCL kernels. It uses CLTune.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The transpose auto-tuner
+template <typename T>
+void TransposeTune(const Arguments<T> &args,
+ const std::vector<T> &a_mat, std::vector<T> &b_mat,
+ cltune::Tuner &tuner) {
+
+ // This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
+ // example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
+ // to be chosen as a representative.
+ std::string common_source =
+ #include "../src/kernels/common.opencl"
+ std::string kernel_source =
+ #include "../src/kernels/transpose.opencl"
+ auto sources = common_source + kernel_source;
+ auto id = tuner.AddKernelFromString(sources, "TransposeMatrix", {args.m, args.n}, {1, 1});
+ tuner.SetReferenceFromString(sources, "TransposeMatrix", {args.m, args.n}, {8, 8});
+
+ // Sets the tunable parameters and their possible values
+ tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
+ tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
+ tuner.AddParameter(id, "TRA_PAD", {0, 1});
+
+ // Tests for a specific precision
+ tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+ tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+ // Sets the constraints for local memory size limitations
+ auto LocalMemorySize = [args] (std::vector<size_t> v) {
+ return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
+ };
+ tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"});
+
+ // Modifies the thread-sizes (both global and local) based on the parameters
+ tuner.DivGlobalSize(id, {"TRA_WPT", "TRA_WPT"});
+ tuner.MulLocalSize(id, {"TRA_DIM", "TRA_DIM"});
+
+ // Sets the function's arguments
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentInput(a_mat);
+ tuner.AddArgumentOutput(b_mat);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void TunerTranspose(int argc, char *argv[]) {
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: TunerAB<float>(argc, argv, TransposeTune<float>); break;
+ case Precision::kDouble: TunerAB<double>(argc, argv, TransposeTune<double>); break;
+ case Precision::kComplexSingle: TunerAB<float2>(argc, argv, TransposeTune<float2>); break;
+ case Precision::kComplexDouble: TunerAB<double2>(argc, argv, TransposeTune<double2>); break;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::TunerTranspose(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc
new file mode 100644
index 00000000..bb93c053
--- /dev/null
+++ b/src/tuning/tuning.cc
@@ -0,0 +1,186 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the common auto-tuning code to interface with the CLTune library.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Function to get the command-line arguments, set up the input buffers, configure the tuner, and
+// collect the results. Used for vector-vector routines.
+template <typename T>
+void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function) {
+
+ // Sets the parameters and platform/device for which to tune (command-line options)
+ auto help = std::string{"* Options given/available:\n"};
+ auto args = Arguments<T>{};
+ args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+ args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+ args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+ args.n = GetArgument(argc, argv, help, kArgN, size_t{4096*1024});
+ args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>());
+ fprintf(stdout, "%s\n", help.c_str());
+
+ // Creates input buffers with random data
+ auto x_vec = std::vector<T>(args.n);
+ auto y_vec = std::vector<T>(args.n);
+ PopulateVector(x_vec);
+ PopulateVector(y_vec);
+
+ // Initializes the tuner for the chosen device
+ cltune::Tuner tuner(args.platform_id, args.device_id);
+
+  // Uses full search to explore all parameter combinations.
+ tuner.UseFullSearch();
+
+ // Configures the tuning parameters (kernel specific)
+ tune_function(args, x_vec, y_vec, tuner);
+
+ // Starts the tuning process
+ tuner.Tune();
+
+ // Prints the results to screen
+ auto time_ms = tuner.PrintToScreen();
+ tuner.PrintFormatted();
+
+ // Also prints the performance of the best-case in terms of GB/s
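+  // (assumes three memory accesses per element: loads of X and Y plus a store of Y)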
+ const auto mega_bytes = (3*args.n*GetBytes(args.precision)) * 1.0e-6;
+ if (time_ms != 0.0) {
+ printf("[ -------> ] %.1lf ms or %.1lf GB/s\n", time_ms, mega_bytes/time_ms);
+ }
+}
+
+// Compiles the above function
+template void TunerXY<float>(int, char**, const Tuner2<float>&);
+template void TunerXY<double>(int, char**, const Tuner2<double>&);
+template void TunerXY<float2>(int, char**, const Tuner2<float2>&);
+template void TunerXY<double2>(int, char**, const Tuner2<double2>&);
+
+// =================================================================================================
+
+// Function to get the command-line arguments, set up the input buffers, configure the tuner, and
+// collect the results. Used for matrix-matrix routines.
+template <typename T>
+void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function) {
+
+ // Sets the parameters and platform/device for which to tune (command-line options)
+ auto help = std::string{"* Options given/available:\n"};
+ auto args = Arguments<T>{};
+ args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+ args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+ args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+ args.m = GetArgument(argc, argv, help, kArgM, size_t{1024});
+ args.n = GetArgument(argc, argv, help, kArgN, size_t{1024});
+ args.fraction = GetArgument(argc, argv, help, kArgFraction, 2048.0);
+ fprintf(stdout, "%s\n", help.c_str());
+
+ // Creates input buffers with random data
+ auto a_mat = std::vector<T>(args.m * args.n);
+ auto b_mat = std::vector<T>(args.m * args.n);
+ PopulateVector(a_mat);
+ PopulateVector(b_mat);
+
+ // Initializes the tuner for the chosen device
+ cltune::Tuner tuner(args.platform_id, args.device_id);
+
+  // Uses full-search to explore all parameter combinations.
+ tuner.UseFullSearch();
+
+ // Configures the tuning parameters (kernel specific)
+ tune_function(args, a_mat, b_mat, tuner);
+
+ // Starts the tuning process
+ tuner.Tune();
+
+ // Prints the results to screen
+ auto time_ms = tuner.PrintToScreen();
+ tuner.PrintFormatted();
+
+ // Also prints the performance of the best-case in terms of GB/s
+ const auto mega_bytes = (2*args.m*args.n*GetBytes(args.precision)) * 1.0e-6;
+ if (time_ms != 0.0) {
+ printf("[ -------> ] %.1lf ms or %.1lf GB/s\n", time_ms, mega_bytes/time_ms);
+ }
+}
+
+// Compiles the above function
+template void TunerAB<float>(int, char**, const Tuner2<float>&);
+template void TunerAB<double>(int, char**, const Tuner2<double>&);
+template void TunerAB<float2>(int, char**, const Tuner2<float2>&);
+template void TunerAB<double2>(int, char**, const Tuner2<double2>&);
+
+// =================================================================================================
+
+// Function to get the command-line arguments, set up the input buffers, configure the tuner, and
+// collect the results. Used for matrix-matrix-matrix routines.
+template <typename T>
+void TunerABC(int argc, char* argv[], const Tuner3<T> &tune_function) {
+
+ // Sets the parameters and platform/device for which to tune (command-line options)
+ auto help = std::string{"* Options given/available:\n"};
+ auto args = Arguments<T>{};
+ args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+ args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+ args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+ args.m = GetArgument(argc, argv, help, kArgM, size_t{1024});
+ args.n = GetArgument(argc, argv, help, kArgN, size_t{1024});
+ args.k = GetArgument(argc, argv, help, kArgK, size_t{1024});
+ args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>());
+ args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>());
+ args.fraction = GetArgument(argc, argv, help, kArgFraction, 2048.0);
+ fprintf(stdout, "%s\n", help.c_str());
+
+ // Creates input buffers with random data
+ auto a_mat = std::vector<T>(args.m * args.k);
+ auto b_mat = std::vector<T>(args.n * args.k);
+ auto c_mat = std::vector<T>(args.m * args.n);
+ PopulateVector(a_mat);
+ PopulateVector(b_mat);
+ PopulateVector(c_mat);
+
+ // Initializes the tuner for the chosen device
+ cltune::Tuner tuner(args.platform_id, args.device_id);
+
+  // Uses random-search to explore only a part of the parameter space. The denominator of the
+  // fraction to explore is set as a command-line argument: the default of 2048 explores 1/2048th
+  // of all valid parameter combinations.
+ tuner.UseRandomSearch(1.0/args.fraction);
+
+ // Configures the tuning parameters (kernel specific)
+ tune_function(args, a_mat, b_mat, c_mat, tuner);
+
+ // Starts the tuning process
+ tuner.Tune();
+
+ // Prints the results to screen
+ auto time_ms = tuner.PrintToScreen();
+ tuner.PrintFormatted();
+
+ // Also prints the performance of the best-case in terms of GFLOPS
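+  // (for example, m = n = k = 1024 corresponds to 2 * 1024^3 ~= 2.15e9 flops, so a best-case
+  // time of 10 ms is reported as roughly 215 GFLOPS)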
+ const auto mega_flops = (2*args.m*args.n*args.k) * 1.0e-6;
+ if (time_ms != 0.0) {
+ printf("[ -------> ] %.1lf ms or %.1lf GFLOPS\n", time_ms, mega_flops/time_ms);
+ }
+}
+
+// Compiles the above function
+template void TunerABC<float>(int, char**, const Tuner3<float>&);
+template void TunerABC<double>(int, char**, const Tuner3<double>&);
+template void TunerABC<float2>(int, char**, const Tuner3<float2>&);
+template void TunerABC<double2>(int, char**, const Tuner3<double2>&);
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/xaxpy.cc b/src/tuning/xaxpy.cc
new file mode 100644
index 00000000..0439ed05
--- /dev/null
+++ b/src/tuning/xaxpy.cc
@@ -0,0 +1,88 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the Xaxpy OpenCL kernel. It uses the CLTune library.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The Xaxpy auto-tuner
+template <typename T>
+void XaxpyTune(const Arguments<T> &args,
+ const std::vector<T> &x_vec, std::vector<T> &y_vec,
+ cltune::Tuner &tuner) {
+
+  // The XaxpyFast kernel only works under certain conditions. Checks here whether the condition
+  // holds for the reference kernel, which is launched with a work-group size of 64.
+ if (!IsMultiple(args.n, 64)) {
+ throw std::runtime_error("The 'XaxpyFast' kernel requires 'n' to be a multiple of WGS*WPT*VW");
+ }
+
+ // This points to the XaxpyFast kernel as found in the CLBlast library
+ std::string common_source =
+ #include "../src/kernels/common.opencl"
+ std::string kernel_source =
+ #include "../src/kernels/xaxpy.opencl"
+ auto sources = common_source + kernel_source;
+ auto id = tuner.AddKernelFromString(sources, "XaxpyFast", {args.n}, {1});
+ tuner.SetReferenceFromString(sources, "XaxpyFast", {args.n}, {64});
+
+ // Sets the tunable parameters and their possible values
+ tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 2048});
+ tuner.AddParameter(id, "WPT", {1, 2, 4, 8});
+ tuner.AddParameter(id, "VW", {1, 2, 4, 8});
+
+ // Tests for a specific precision
+ tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+ tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+ // Modifies the thread-sizes (local) based on the parameters
+ tuner.MulLocalSize(id, {"WGS"});
+ tuner.DivGlobalSize(id, {"WPT"});
+ tuner.DivGlobalSize(id, {"VW"});
+
+ // Sets the function's arguments
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(args.alpha);
+ tuner.AddArgumentInput(x_vec);
+ tuner.AddArgumentOutput(y_vec);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void TunerXaxpy(int argc, char *argv[]) {
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: TunerXY<float>(argc, argv, XaxpyTune<float>); break;
+ case Precision::kDouble: TunerXY<double>(argc, argv, XaxpyTune<double>); break;
+ case Precision::kComplexSingle: TunerXY<float2>(argc, argv, XaxpyTune<float2>); break;
+ case Precision::kComplexDouble: TunerXY<double2>(argc, argv, XaxpyTune<double2>); break;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::TunerXaxpy(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc
new file mode 100644
index 00000000..aba56810
--- /dev/null
+++ b/src/tuning/xgemm.cc
@@ -0,0 +1,126 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the Xgemm OpenCL kernel. It uses the CLTune library.
+// Note that this tuner uses random-search: running it multiple times or with a larger fraction
+// argument might be necessary to obtain good results.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The Xgemm auto-tuner
+template <typename T>
+void XgemmTune(const Arguments<T> &args,
+ const std::vector<T> &a_mat, const std::vector<T> &b_mat, std::vector<T> &c_mat,
+ cltune::Tuner &tuner) {
+
+ // This points to the Xgemm kernel as found in the CLBlast library and its golden reference
+ std::string common_source =
+ #include "../src/kernels/common.opencl"
+ std::string kernel_source =
+ #include "../src/kernels/xgemm.opencl"
+ auto sources = common_source + kernel_source;
+ auto id = tuner.AddKernelFromString(sources, "Xgemm", {args.m, args.n}, {1, 1});
+ tuner.SetReferenceFromString(sources, "Xgemm", {args.m, args.n}, {8, 8});
+
+ // Sets the tunable parameters and their possible values
+ tuner.AddParameter(id, "MWG", {16, 32, 64, 128});
+ tuner.AddParameter(id, "NWG", {16, 32, 64, 128});
+ tuner.AddParameter(id, "KWG", {16, 32});
+ tuner.AddParameter(id, "MDIMC", {8, 16, 32});
+ tuner.AddParameter(id, "NDIMC", {8, 16, 32});
+ tuner.AddParameter(id, "MDIMA", {8, 16, 32});
+ tuner.AddParameter(id, "NDIMB", {8, 16, 32});
+ tuner.AddParameter(id, "KWI", {2, 8});
+ tuner.AddParameter(id, "VWM", {1, 2, 4, 8});
+ tuner.AddParameter(id, "VWN", {1, 2, 4, 8});
+ tuner.AddParameter(id, "STRM", {0, 1});
+ tuner.AddParameter(id, "STRN", {0, 1});
+ tuner.AddParameter(id, "SA", {0, 1});
+ tuner.AddParameter(id, "SB", {0, 1});
+
+ // Tests for a specific precision
+ tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+ tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+ // Sets the helper functions to implement the constraints below
+ auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
+ auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
+ auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
+
+ // Sets constraints: Requirement for unrolling the KWG loop
+ tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"});
+
+ // Sets constraints: Required for integer MWI and NWI
+ tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"});
+ tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"});
+
+ // Sets constraints: Required for integer MWIA and NWIB
+ tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"});
+ tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"});
+
+ // Sets constraints: KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...)
+ tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"});
+ tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"});
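+  // (for example, MDIMC=16, NDIMC=16 and MDIMA=8 give KDIMA = (16*16)/8 = 32, so only KWG=32 is
+  // accepted in that case)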
+
+ // Sets the constraints for local memory size limitations
+ auto LocalMemorySize = [args] (std::vector<size_t> v) {
+ return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision));
+ };
+ tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM",
+ "SB", "KWG", "NWG", "VWN"});
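+  // As an illustration of the above: with SA=1, KWG=32, MWG=64, VWM=4 and SB=1, NWG=64, VWN=4 in
+  // single precision, the constraint function evaluates to ((32*64/4) + (32*64/4)) * 4 = 4096
+  // bytes of local memory.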
+
+ // Modifies the thread-sizes (both global and local) based on the parameters
+ tuner.MulLocalSize(id, {"MDIMC", "NDIMC"});
+ tuner.MulGlobalSize(id, {"MDIMC", "NDIMC"});
+ tuner.DivGlobalSize(id, {"MWG", "NWG"});
+
+ // Sets the function's arguments
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(static_cast<int>(args.k));
+ tuner.AddArgumentScalar(args.alpha);
+ tuner.AddArgumentScalar(args.beta);
+ tuner.AddArgumentInput(a_mat);
+ tuner.AddArgumentInput(b_mat);
+ tuner.AddArgumentOutput(c_mat);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void TunerXgemm(int argc, char *argv[]) {
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: TunerABC<float>(argc, argv, XgemmTune<float>); break;
+ case Precision::kDouble: TunerABC<double>(argc, argv, XgemmTune<double>); break;
+ case Precision::kComplexSingle: TunerABC<float2>(argc, argv, XgemmTune<float2>); break;
+ case Precision::kComplexDouble: TunerABC<double2>(argc, argv, XgemmTune<double2>); break;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::TunerXgemm(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/utilities.cc b/src/utilities.cc
new file mode 100644
index 00000000..80cea852
--- /dev/null
+++ b/src/utilities.cc
@@ -0,0 +1,255 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the common (test) utility functions.
+//
+// =================================================================================================
+
+#include "internal/utilities.h"
+
+#include <string>
+#include <vector>
+#include <chrono>
+#include <random>
+#include <iomanip>
+
+namespace clblast {
+// =================================================================================================
+
+// Implements the string conversion using std::to_string if possible
+template <typename T>
+std::string ToString(T value) {
+ return std::to_string(value);
+}
+template std::string ToString<int>(int value);
+template std::string ToString<size_t>(size_t value);
+template std::string ToString<float>(float value);
+template std::string ToString<double>(double value);
+
+// If not possible directly: special cases for complex data-types
+template <>
+std::string ToString(float2 value) {
+ std::ostringstream real, imag;
+ real << std::setprecision(2) << value.real();
+ imag << std::setprecision(2) << value.imag();
+ return real.str()+"+"+imag.str()+"i";
+}
+template <>
+std::string ToString(double2 value) {
+ std::ostringstream real, imag;
+ real << std::setprecision(2) << value.real();
+ imag << std::setprecision(2) << value.imag();
+ return real.str()+"+"+imag.str()+"i";
+}
+
+// If not possible directly: special cases for CLBlast data-types
+template <>
+std::string ToString(Layout value) {
+ switch(value) {
+ case Layout::kRowMajor: return ToString(static_cast<int>(value))+" (row-major)";
+ case Layout::kColMajor: return ToString(static_cast<int>(value))+" (col-major)";
+ }
+}
+template <>
+std::string ToString(Transpose value) {
+ switch(value) {
+ case Transpose::kNo: return ToString(static_cast<int>(value))+" (regular)";
+ case Transpose::kYes: return ToString(static_cast<int>(value))+" (transposed)";
+ case Transpose::kConjugate: return ToString(static_cast<int>(value))+" (conjugate)";
+ }
+}
+template <>
+std::string ToString(Side value) {
+ switch(value) {
+ case Side::kLeft: return ToString(static_cast<int>(value))+" (left)";
+ case Side::kRight: return ToString(static_cast<int>(value))+" (right)";
+ }
+}
+template <>
+std::string ToString(Triangle value) {
+ switch(value) {
+ case Triangle::kUpper: return ToString(static_cast<int>(value))+" (upper)";
+ case Triangle::kLower: return ToString(static_cast<int>(value))+" (lower)";
+ }
+}
+template <>
+std::string ToString(Precision value) {
+ switch(value) {
+ case Precision::kHalf: return ToString(static_cast<int>(value))+" (half)";
+ case Precision::kSingle: return ToString(static_cast<int>(value))+" (single)";
+ case Precision::kDouble: return ToString(static_cast<int>(value))+" (double)";
+ case Precision::kComplexSingle: return ToString(static_cast<int>(value))+" (complex-single)";
+ case Precision::kComplexDouble: return ToString(static_cast<int>(value))+" (complex-double)";
+ }
+}
+
+// =================================================================================================
+
+// Helper for the below function to convert the argument to the value type. Adds specialization for
+// complex data-types. Note that complex arguments are accepted as regular values and are copied to
+// both the real and imaginary parts.
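+// For example, ConvertArgument<float2>("2.5") yields float2{2.5f, 2.5f}.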
+template <typename T>
+T ConvertArgument(const char* value) {
+ return static_cast<T>(std::stod(value));
+}
+template <> float2 ConvertArgument(const char* value) {
+ auto val = static_cast<float>(std::stod(value));
+ return float2{val, val};
+}
+template <> double2 ConvertArgument(const char* value) {
+ auto val = static_cast<double>(std::stod(value));
+ return double2{val, val};
+}
+
+// This function matches patterns in the form of "-option value" or "--option value". It returns a
+// default value in case the option is not found in the argument string.
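+// For example, with option "n", a command line containing "-n 8192" (or "--n 8192") makes the
+// call return 8192; otherwise the given default is returned. Note that a value is expected to
+// directly follow the option on the command line.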
+template <typename T>
+T GetArgument(const int argc, char *argv[], std::string &help,
+ const std::string &option, const T default_value) {
+
+ // Parses the argument. Note that this supports both the given option (e.g. -device) and one with
+ // an extra dash in front (e.g. --device).
+ auto return_value = static_cast<T>(default_value);
+ for (int c=0; c<argc; ++c) {
+ auto item = std::string{argv[c]};
+ if (item.compare("-"+option) == 0 || item.compare("--"+option) == 0) {
+ ++c;
+ return_value = ConvertArgument<T>(argv[c]);
+ break;
+ }
+ }
+
+ // Updates the help message and returns
+ help += " -"+option+" "+ToString(return_value)+" ";
+ help += (return_value == default_value) ? "[=default]\n" : "\n";
+ return return_value;
+}
+
+// Compiles the above function
+template bool GetArgument<bool>(const int, char **, std::string&, const std::string&, const bool);
+template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int);
+template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t);
+template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float);
+template double GetArgument<double>(const int, char **, std::string&, const std::string&, const double);
+template float2 GetArgument<float2>(const int, char **, std::string&, const std::string&, const float2);
+template double2 GetArgument<double2>(const int, char **, std::string&, const std::string&, const double2);
+template Layout GetArgument<Layout>(const int, char **, std::string&, const std::string&, const Layout);
+template Transpose GetArgument<Transpose>(const int, char **, std::string&, const std::string&, const Transpose);
+template Side GetArgument<Side>(const int, char **, std::string&, const std::string&, const Side);
+template Triangle GetArgument<Triangle>(const int, char **, std::string&, const std::string&, const Triangle);
+template Precision GetArgument<Precision>(const int, char **, std::string&, const std::string&, const Precision);
+
+// =================================================================================================
+
+// Returns only the precision argument
+Precision GetPrecision(const int argc, char *argv[]) {
+ auto dummy = std::string{};
+ return GetArgument(argc, argv, dummy, kArgPrecision, Precision::kSingle);
+}
+
+// =================================================================================================
+
+// Checks whether an argument is given. Returns true or false.
+bool CheckArgument(const int argc, char *argv[], std::string &help,
+ const std::string &option) {
+
+ // Updates the help message
+ help += " -"+option+"\n";
+
+ // Parses the argument. Note that this supports both the given option (e.g. -device) and one with
+ // an extra dash in front (e.g. --device).
+ for (int c=0; c<argc; ++c) {
+ auto item = std::string{argv[c]};
+ if (item.compare("-"+option) == 0 || item.compare("--"+option) == 0) { ++c; return true; }
+ }
+ return false;
+}
+
+// =================================================================================================
+
+// Returns a random seed. This used to be implemented using 'std::random_device', but that doesn't
+// always work. The chrono-timers are more reliable in that sense, but perhaps less random.
+unsigned int GetRandomSeed() {
+ return static_cast<unsigned int>(std::chrono::system_clock::now().time_since_epoch().count());
+}
+
+// Creates a random number generator and populates a vector with samples from a uniform random distribution
+template <typename T>
+void PopulateVector(std::vector<T> &vector) {
+ std::mt19937 mt(GetRandomSeed());
+ std::uniform_real_distribution<T> dist(static_cast<T>(-2.0), static_cast<T>(2.0));
+ for (auto &element: vector) { element = dist(mt); }
+}
+template void PopulateVector<float>(std::vector<float>&);
+template void PopulateVector<double>(std::vector<double>&);
+
+// Specialized versions of the above for complex data-types
+template <>
+void PopulateVector(std::vector<float2> &vector) {
+ std::mt19937 mt(GetRandomSeed());
+ std::uniform_real_distribution<float> dist(-2.0f, 2.0f);
+ for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
+}
+template <>
+void PopulateVector(std::vector<double2> &vector) {
+ std::mt19937 mt(GetRandomSeed());
+ std::uniform_real_distribution<double> dist(-2.0, 2.0);
+ for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
+}
+
+// =================================================================================================
+
+// Returns a scalar with a default value
+template <typename T>
+T GetScalar() {
+ return static_cast<T>(2.0);
+}
+template float GetScalar<float>();
+template double GetScalar<double>();
+
+// Specialized versions of the above for complex data-types
+template <>
+float2 GetScalar() {
+ return {2.0f, 0.5f};
+}
+template <>
+double2 GetScalar() {
+ return {2.0, 0.5};
+}
+
+// =================================================================================================
+
+// Rounding functions: ceiled division and rounding up to the next multiple
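+// (for example, CeilDiv(10, 4) == 3 and Ceil(10, 4) == 12; note that 'x' is assumed to be
+// non-zero, since the subtraction is performed on unsigned values)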
+size_t CeilDiv(const size_t x, const size_t y) {
+ return 1 + ((x - 1) / y);
+}
+size_t Ceil(const size_t x, const size_t y) {
+ return CeilDiv(x,y)*y;
+}
+
+// Helper function to determine whether or not 'a' is a multiple of 'b'
+bool IsMultiple(const size_t a, const size_t b) {
+  return ((a/b)*b == a);
+}
+
+// =================================================================================================
+
+// Converts the precision enum into the number of bytes per data element
+size_t GetBytes(const Precision precision) {
+ switch(precision) {
+ case Precision::kHalf: return 2;
+ case Precision::kSingle: return 4;
+ case Precision::kDouble: return 8;
+ case Precision::kComplexSingle: return 8;
+ case Precision::kComplexDouble: return 16;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/test/correctness/routines/xaxpy.cc b/test/correctness/routines/xaxpy.cc
new file mode 100644
index 00000000..aa90766e
--- /dev/null
+++ b/test/correctness/routines/xaxpy.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xaxpy routine. It is based on the TestXY class.
+//
+// =================================================================================================
+
+#include "wrapper_clblas.h"
+#include "correctness/testxy.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+template <typename T>
+void XaxpyTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+ // Creates the CLBlast lambda
+ auto clblast_lambda = [](const Arguments<T> &args,
+ const Buffer &x_vec, const Buffer &y_vec,
+ CommandQueue &queue) -> StatusCode {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ return Axpy(args.n, args.alpha,
+ x_vec(), args.x_offset, args.x_inc,
+ y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ };
+
+ // Creates the clBLAS lambda (for comparison)
+ auto clblas_lambda = [](const Arguments<T> &args,
+ const Buffer &x_vec, const Buffer &y_vec,
+ CommandQueue &queue) -> StatusCode {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXaxpy(args.n, args.alpha,
+ x_vec(), args.x_offset, args.x_inc,
+ y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ return static_cast<StatusCode>(status);
+ };
+
+ // Selects the platform and device on which to test (command-line options)
+ auto help = std::string{"Options given/available:\n"};
+ const auto platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+ const auto device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+ if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); }
+
+ // Initializes the other arguments relevant for this routine
+ auto args = Arguments<T>{};
+ const auto options = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
+ kArgXOffset, kArgYOffset, kArgAlpha};
+
+ // Creates a tester
+ TestXY<T> tester{platform_id, device_id, name, options, clblast_lambda, clblas_lambda};
+
+ // Runs the tests
+ const auto case_name = "default";
+ tester.TestRegular(args, case_name);
+ tester.TestInvalidBufferSizes(args, case_name);
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::XaxpyTest<float>(argc, argv, false, "SAXPY");
+ clblast::XaxpyTest<double>(argc, argv, true, "DAXPY");
+ clblast::XaxpyTest<clblast::float2>(argc, argv, true, "CAXPY");
+ clblast::XaxpyTest<clblast::double2>(argc, argv, true, "ZAXPY");
+ return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/xgemm.cc b/test/correctness/routines/xgemm.cc
new file mode 100644
index 00000000..04525cc5
--- /dev/null
+++ b/test/correctness/routines/xgemm.cc
@@ -0,0 +1,104 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xgemm routine. It is based on the TestABC class.
+//
+// =================================================================================================
+
+#include "wrapper_clblas.h"
+#include "correctness/testabc.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+template <typename T>
+void XgemmTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+ // Creates the CLBlast lambda
+ auto clblast_lambda = [](const Arguments<T> &args,
+ const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
+ CommandQueue &queue) -> StatusCode {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ return Gemm(args.layout, args.a_transpose, args.b_transpose,
+ args.m, args.n, args.k,
+ args.alpha,
+ a_mat(), args.a_offset, args.a_ld,
+ b_mat(), args.b_offset, args.b_ld,
+ args.beta,
+ c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ };
+
+ // Creates the clBLAS lambda (for comparison)
+ auto clblas_lambda = [](const Arguments<T> &args,
+ const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
+ CommandQueue &queue) -> StatusCode {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasTranspose>(args.b_transpose),
+ args.m, args.n, args.k,
+ args.alpha,
+ a_mat(), args.a_offset, args.a_ld,
+ b_mat(), args.b_offset, args.b_ld,
+ args.beta,
+ c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ return static_cast<StatusCode>(status);
+ };
+
+ // Selects the platform and device on which to test (command-line options)
+ auto help = std::string{"Options given/available:\n"};
+ const auto platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+ const auto device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+ if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); }
+
+ // Initializes the other arguments relevant for this routine
+ auto args = Arguments<T>{};
+ const auto options = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
+ kArgATransp, kArgBTransp,
+ kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+ kArgAOffset, kArgBOffset, kArgCOffset};
+
+ // Creates a tester
+ TestABC<T> tester{platform_id, device_id, name, options, clblast_lambda, clblas_lambda};
+
+ // Loops over the test-cases from a data-layout point of view
+ for (auto &layout: {Layout::kRowMajor, Layout::kColMajor}) {
+ args.layout = layout;
+ for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) {
+ args.a_transpose = a_transpose;
+ for (auto &b_transpose: {Transpose::kNo, Transpose::kYes}) {
+ args.b_transpose = b_transpose;
+ const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
+
+ // Runs the tests
+ tester.TestRegular(args, case_name);
+ tester.TestInvalidBufferSizes(args, case_name);
+ }
+ }
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::XgemmTest<float>(argc, argv, false, "SGEMM");
+ clblast::XgemmTest<double>(argc, argv, true, "DGEMM");
+ //clblast::XgemmTest<float2>(argc, argv, true, "CGEMM");
+ //clblast::XgemmTest<double2>(argc, argv, true, "ZGEMM");
+ return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/xsymm.cc b/test/correctness/routines/xsymm.cc
new file mode 100644
index 00000000..9bcad253
--- /dev/null
+++ b/test/correctness/routines/xsymm.cc
@@ -0,0 +1,104 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xsymm routine. It is based on the TestABC class.
+//
+// =================================================================================================
+
+#include "wrapper_clblas.h"
+#include "correctness/testabc.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
+template <typename T>
+void XsymmTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+ // Creates the CLBlast lambda
+ auto clblast_lambda = [](const Arguments<T> &args,
+ const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
+ CommandQueue &queue) -> StatusCode {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ return Symm(args.layout, args.side, args.triangle,
+ args.m, args.n,
+ args.alpha,
+ a_mat(), args.a_offset, args.a_ld,
+ b_mat(), args.b_offset, args.b_ld,
+ args.beta,
+ c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ };
+
+ // Creates the clBLAS lambda (for comparison)
+ auto clblas_lambda = [](const Arguments<T> &args,
+ const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
+ CommandQueue &queue) -> StatusCode {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasSide>(args.side),
+ static_cast<clblasUplo>(args.triangle),
+ args.m, args.n,
+ args.alpha,
+ a_mat(), args.a_offset, args.a_ld,
+ b_mat(), args.b_offset, args.b_ld,
+ args.beta,
+ c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ return static_cast<StatusCode>(status);
+ };
+
+ // Selects the platform and device on which to test (command-line options)
+ auto help = std::string{"Options given/available:\n"};
+ const auto platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+ const auto device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+ if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); }
+
+ // Initializes the other arguments relevant for this routine
+ auto args = Arguments<T>{};
+ const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout,
+ kArgSide, kArgTriangle,
+ kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+ kArgAOffset, kArgBOffset, kArgCOffset};
+
+ // Creates a tester
+ TestABC<T> tester{platform_id, device_id, name, options, clblast_lambda, clblas_lambda};
+
+ // Loops over the test-cases from a data-layout point of view
+ for (auto &layout: {Layout::kRowMajor, Layout::kColMajor}) {
+ args.layout = layout;
+ for (auto &side: {Side::kLeft, Side::kRight}) {
+ args.side = side;
+ for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
+ args.triangle = triangle;
+ const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
+
+ // Runs the tests
+ tester.TestRegular(args, case_name);
+ tester.TestInvalidBufferSizes(args, case_name);
+ }
+ }
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::XsymmTest<float>(argc, argv, false, "SSYMM");
+ clblast::XsymmTest<double>(argc, argv, true, "DSYMM");
+ //clblast::XsymmTest<float2>(argc, argv, true, "CSYMM");
+ //clblast::XsymmTest<double2>(argc, argv, true, "ZSYMM");
+ return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/testabc.cc b/test/correctness/testabc.cc
new file mode 100644
index 00000000..5d5869c8
--- /dev/null
+++ b/test/correctness/testabc.cc
@@ -0,0 +1,212 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the TestABC class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include <algorithm>
+
+#include "correctness/testabc.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor, initializes the base class tester and input data
+template <typename T>
+TestABC<T>::TestABC(const size_t platform_id, const size_t device_id,
+ const std::string &name, const std::vector<std::string> &options,
+ const Routine clblast_lambda, const Routine clblas_lambda):
+ Tester<T>{platform_id, device_id, name, options},
+ clblast_lambda_(clblast_lambda),
+ clblas_lambda_(clblas_lambda) {
+
+ // Computes the maximum sizes. This allows for a single set of input/output buffers.
+ auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+ auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+ auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
+
+ // Creates test input data
+ a_source_.resize(max_dim*max_ld + max_offset);
+ b_source_.resize(max_dim*max_ld + max_offset);
+ c_source_.resize(max_dim*max_ld + max_offset);
+ PopulateVector(a_source_);
+ PopulateVector(b_source_);
+ PopulateVector(c_source_);
+}
+
+// =================================================================================================
+
+// Tests the routine for a wide variety of parameters
+template <typename T>
+void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
+ TestStart("regular behaviour", name);
+
+  // Computes whether or not the matrices are stored in rotated form. The default is assumed to be
+  // column-major and non-transposed: a matrix is considered rotated when exactly one of the two
+  // (layout or transposition) deviates from this default.
+ auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+ (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+ auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
+ (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
+ auto c_rotated = (args.layout == Layout::kRowMajor);
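+  // (for instance, a row-major non-transposed A is considered rotated here, whereas a row-major
+  // transposed A is not)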
+
+ // Iterates over the matrix dimensions
+ for (auto &m: kMatrixDims) {
+ args.m = m;
+ for (auto &n: kMatrixDims) {
+ args.n = n;
+ for (auto &k: kMatrixDims) {
+ args.k = k;
+
+ // Computes the second dimensions of the matrices taking the rotation into account
+ auto a_two = (a_rotated) ? m : k;
+ auto b_two = (b_rotated) ? k : n;
+ auto c_two = (c_rotated) ? m : n;
+
+ // Iterates over the leading-dimension values and the offsets
+ for (auto &a_ld: kMatrixDims) {
+ args.a_ld = a_ld;
+ for (auto &a_offset: kOffsets) {
+ args.a_offset = a_offset;
+ for (auto &b_ld: kMatrixDims) {
+ args.b_ld = b_ld;
+ for (auto &b_offset: kOffsets) {
+ args.b_offset = b_offset;
+ for (auto &c_ld: kMatrixDims) {
+ args.c_ld = c_ld;
+ for (auto &c_offset: kOffsets) {
+ args.c_offset = c_offset;
+
+ // Computes the buffer sizes
+ auto a_size = a_two * a_ld + a_offset;
+ auto b_size = b_two * b_ld + b_offset;
+ auto c_size = c_two * c_ld + c_offset;
+ if (a_size < 1 || b_size < 1 || c_size < 1) { continue; }
+
+ // Creates the OpenCL buffers
+ auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
+ auto b_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
+ auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
+ auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
+
+ // Iterates over the values for alpha and beta
+ for (auto &alpha: kAlphaValues) {
+ args.alpha = alpha;
+ for (auto &beta: kBetaValues) {
+ args.beta = beta;
+
+ // Runs the reference clBLAS code
+ a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
+ b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
+ r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
+ auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
+
+ // Runs the CLBlast code
+ a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
+ b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
+ s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
+ auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
+
+ // Tests for equality of the two status codes
+ if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
+ TestErrorCodes(status1, status2, args);
+ continue;
+ }
+
+ // Downloads the results
+ std::vector<T> r_result(c_size, static_cast<T>(0));
+ std::vector<T> s_result(c_size, static_cast<T>(0));
+ r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
+ s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
+
+ // Checks for differences in the output
+ auto errors = size_t{0};
+ for (auto idm=size_t{0}; idm<m; ++idm) {
+ for (auto idn=size_t{0}; idn<n; ++idn) {
+ auto index = (args.layout == Layout::kRowMajor) ?
+ idm*args.c_ld + idn + args.c_offset:
+ idn*args.c_ld + idm + args.c_offset;
+ if (!TestSimilarity(r_result[index], s_result[index], kErrorMargin)) {
+ errors++;
+ }
+ }
+ }
+
+ // Tests the error count (should be zero)
+ TestErrorCount(errors, m*n, args);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ TestEnd();
+}
+
+// =================================================================================================
+
+// Tests the routine for cases with invalid OpenCL memory buffer sizes. Only the returned status
+// codes are tested; the results (if any) are not checked.
+template <typename T>
+void TestABC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
+ TestStart("invalid buffer sizes", name);
+
+ // Sets example test parameters
+ args.m = kBufferSize;
+ args.n = kBufferSize;
+ args.k = kBufferSize;
+ args.a_ld = kBufferSize;
+ args.b_ld = kBufferSize;
+ args.c_ld = kBufferSize;
+
+ // Iterates over test buffer sizes
+ const std::vector<size_t> kBufferSizes = {0, kBufferSize - 1, kBufferSize};
+ for (auto &a_size: kBufferSizes) {
+ for (auto &b_size: kBufferSizes) {
+ for (auto &c_size: kBufferSizes) {
+
+ // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
+ // want to be able to create invalid buffers (no error checking here).
+ auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
+ auto a_mat = Buffer(a);
+ auto b = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
+ auto b_mat = Buffer(b);
+ auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
+ auto r_mat = Buffer(r);
+ auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
+ auto s_mat = Buffer(s);
+
+ // Runs the two routines
+ auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
+ auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
+
+ // Tests for equality of the two status codes
+ TestErrorCodes(status1, status2, args);
+ }
+ }
+ }
+ TestEnd();
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class TestABC<float>;
+template class TestABC<double>;
+template class TestABC<float2>;
+template class TestABC<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/test/correctness/testabc.h b/test/correctness/testabc.h
new file mode 100644
index 00000000..bb06ea22
--- /dev/null
+++ b/test/correctness/testabc.h
@@ -0,0 +1,94 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under the MIT license. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file tests any mat-mat-mat (A,B,C) routine. It contains two types of tests: one testing
+// all sorts of input combinations, and one deliberately testing with invalid values.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_CORRECTNESS_TESTABC_H_
+#define CLBLAST_TEST_CORRECTNESS_TESTABC_H_
+
+#include <vector>
+#include <string>
+
+#include "correctness/tester.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Defines the parameters that delineate individual test-cases
+struct Parameters {
+ Layout layout;
+ Transpose a_transpose;
+ Transpose b_transpose;
+ std::string GetString() const {
+ return "Layout: "+ToString(layout)+", A: "+ToString(a_transpose)+
+ ", B: "+ToString(b_transpose);
+ }
+};
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestABC: public Tester<T> {
+ public:
+
+ // Uses several variables from the Tester class
+ using Tester<T>::context_;
+ using Tester<T>::queue_;
+ using Tester<T>::kErrorMargin;
+
+ // Uses several helper functions from the Tester class
+ using Tester<T>::TestStart;
+ using Tester<T>::TestEnd;
+ using Tester<T>::TestSimilarity;
+ using Tester<T>::TestErrorCount;
+ using Tester<T>::TestErrorCodes;
+ using Tester<T>::GetExampleScalars;
+
+ // Test settings for the regular test. Append to this list in case more tests are required.
+ const std::vector<size_t> kMatrixDims = { 7, 64 };
+ const std::vector<size_t> kOffsets = { 0 };
+ const std::vector<T> kAlphaValues = GetExampleScalars();
+ const std::vector<T> kBetaValues = GetExampleScalars();
+
+ // Test settings for the invalid test
+ const size_t kBufferSize = 64;
+
+ // Shorthand for a BLAS routine
+ using Routine = std::function<StatusCode(const Arguments<T>&,
+ const Buffer&, const Buffer&, const Buffer&,
+ CommandQueue&)>;
+
+ // Constructor, initializes the base class tester and input data
+ TestABC(const size_t platform_id, const size_t device_id,
+ const std::string &name, const std::vector<std::string> &options,
+ const Routine clblast_lambda, const Routine clblas_lambda);
+
+ // The test functions, taking no inputs
+ void TestRegular(Arguments<T> &args, const std::string &name);
+ void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
+
+ private:
+
+ // Source data to test with
+ std::vector<T> a_source_;
+ std::vector<T> b_source_;
+ std::vector<T> c_source_;
+
+ // The routines to test
+ Routine clblast_lambda_;
+ Routine clblas_lambda_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_CORRECTNESS_TESTABC_H_
+#endif
diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc
new file mode 100644
index 00000000..da1cb152
--- /dev/null
+++ b/test/correctness/tester.cc
@@ -0,0 +1,307 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Tester class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "correctness/tester.h"
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <cmath>
+#include <limits>
+
+namespace clblast {
+// =================================================================================================
+
+// General constructor for all CLBlast testers. It prints out the test header to stdout and sets up
+// the clBLAS library for reference.
+template <typename T>
+Tester<T>::Tester(const size_t platform_id, const size_t device_id,
+ const std::string &name, const std::vector<std::string> &options):
+ platform_(Platform(platform_id)),
+ device_(Device(platform_, kDeviceType, device_id)),
+ context_(Context(device_)),
+ queue_(CommandQueue(context_, device_)),
+ error_log_{},
+ num_passed_{0},
+ num_skipped_{0},
+ num_errors_{0},
+ print_count_{0},
+ tests_failed_{0},
+ tests_passed_{0},
+ options_{options} {
+
+ // Prints the header
+ fprintf(stdout, "* Running on OpenCL device '%s'.\n", device_.Name().c_str());
+ fprintf(stdout, "* Starting tests for the %s'%s'%s routine. Legend:\n",
+ kPrintMessage.c_str(), name.c_str(), kPrintEnd.c_str());
+ fprintf(stdout, " %s -> Test produced correct results\n", kSuccessData.c_str());
+ fprintf(stdout, " %s -> Test returned the correct error code\n", kSuccessStatus.c_str());
+ fprintf(stdout, " %s -> Test produced incorrect results\n", kErrorData.c_str());
+ fprintf(stdout, " %s -> Test returned an incorrect error code\n", kErrorStatus.c_str());
+ fprintf(stdout, " %s -> Test not executed: OpenCL-kernel compilation error\n",
+ kSkippedCompilation.c_str());
+ fprintf(stdout, " %s -> Test not executed: Unsupported precision\n",
+ kUnsupportedPrecision.c_str());
+
+ // Initializes clBLAS
+ auto status = clblasSetup();
+ if (status != CL_SUCCESS) {
+ throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status)));
+ }
+}
+
+// Destructor prints the summary of the test cases and cleans up the clBLAS library
+template <typename T>
+Tester<T>::~Tester() {
+ fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
+ fprintf(stdout, " %lu test(s) succeeded\n", tests_passed_);
+ if (tests_failed_ != 0) {
+ fprintf(stdout, " %s%lu test(s) failed%s\n",
+ kPrintError.c_str(), tests_failed_, kPrintEnd.c_str());
+ }
+ else {
+ fprintf(stdout, " %lu test(s) failed\n", tests_failed_);
+ }
+ fprintf(stdout, "\n");
+ clblasTeardown();
+}
+
+// =================================================================================================
+
+// Function called at the start of each test. This prints a header with information about the
+// test and re-initializes all test data-structures.
+template <typename T>
+void Tester<T>::TestStart(const std::string &test_name, const std::string &test_configuration) {
+
+ // Prints the header
+ fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n",
+ kPrintMessage.c_str(), test_name.c_str(), kPrintEnd.c_str(),
+ kPrintMessage.c_str(), test_configuration.c_str(), kPrintEnd.c_str());
+ fprintf(stdout, " ");
+
+ // Empties the error log and the error/pass counters
+ error_log_.clear();
+ num_passed_ = 0;
+ num_skipped_ = 0;
+ num_errors_ = 0;
+ print_count_ = 0;
+}
+
+// Function called at the end of each test. This prints errors if any occurred. It also prints a
+// summary of the number of sub-tests passed/failed.
+template <typename T>
+void Tester<T>::TestEnd() {
+ fprintf(stdout, "\n");
+ if (error_log_.size() == 0) { tests_passed_++; } else { tests_failed_++; }
+
+  // Prints details of all error occurrences for these tests
+ for (auto &entry: error_log_) {
+ if (entry.error_percentage != kStatusError) {
+ fprintf(stdout, " Error rate %.1lf%%: ", entry.error_percentage);
+ }
+ else {
+ fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect);
+ }
+ for (auto &o: options_) {
+ if (o == kArgM) { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); }
+ if (o == kArgN) { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); }
+ if (o == kArgK) { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); }
+ if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
+ if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
+ if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
+ if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
+ if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
+ if (o == kArgXInc) { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
+ if (o == kArgYInc) { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
+ if (o == kArgXOffset) { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
+ if (o == kArgYOffset) { fprintf(stdout, "%s=%lu ", kArgYOffset, entry.args.y_offset);}
+ if (o == kArgALeadDim) { fprintf(stdout, "%s=%lu ", kArgALeadDim, entry.args.a_ld);}
+ if (o == kArgBLeadDim) { fprintf(stdout, "%s=%lu ", kArgBLeadDim, entry.args.b_ld);}
+ if (o == kArgCLeadDim) { fprintf(stdout, "%s=%lu ", kArgCLeadDim, entry.args.c_ld);}
+ if (o == kArgAOffset) { fprintf(stdout, "%s=%lu ", kArgAOffset, entry.args.a_offset);}
+ if (o == kArgBOffset) { fprintf(stdout, "%s=%lu ", kArgBOffset, entry.args.b_offset);}
+ if (o == kArgCOffset) { fprintf(stdout, "%s=%lu ", kArgCOffset, entry.args.c_offset);}
+ }
+ fprintf(stdout, "\n");
+ }
+
+ // Prints a test summary
+ auto pass_rate = 100*num_passed_ / static_cast<float>(num_passed_ + num_skipped_ + num_errors_);
+ fprintf(stdout, " Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str());
+ fprintf(stdout, " %lu passed /", num_passed_);
+ if (num_skipped_ != 0) {
+ fprintf(stdout, " %s%lu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str());
+ }
+ else {
+ fprintf(stdout, " %lu skipped /", num_skipped_);
+ }
+ if (num_errors_ != 0) {
+ fprintf(stdout, " %s%lu failed%s\n", kPrintError.c_str(), num_errors_, kPrintEnd.c_str());
+ }
+ else {
+ fprintf(stdout, " %lu failed\n", num_errors_);
+ }
+}
+
+// =================================================================================================
+
+// Compares two floating point values and returns whether they are within an acceptable error
+// margin. This replaces GTest's EXPECT_NEAR().
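+// For example, with a margin of (say) 0.005, the values 1.000 and 1.004 are considered similar
+// (relative difference 0.004 / 2.004 ~= 0.002), whereas 1.000 and 1.100 are not.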
+template <typename T>
+bool Tester<T>::TestSimilarity(const T val1, const T val2, const double margin) {
+ const auto difference = std::fabs(val1 - val2);
+
+ // Shortcut, handles infinities
+ if (val1 == val2) {
+ return true;
+ }
+  // The values are zero or both are extremely close to it: relative error is less meaningful
+ else if (val1 == 0 || val2 == 0 || difference < std::numeric_limits<T>::min()) {
+ return difference < (static_cast<T>(margin) * std::numeric_limits<T>::min());
+ }
+ // Use relative error
+ else {
+ return (difference / (std::fabs(val1) + std::fabs(val2))) < static_cast<T>(margin);
+ }
+}
+
+// Specialisations for complex data-types
+template <>
+bool Tester<float2>::TestSimilarity(const float2 val1, const float2 val2, const double margin) {
+ auto real = Tester<float>::TestSimilarity(val1.real(), val2.real(), margin);
+ auto imag = Tester<float>::TestSimilarity(val1.imag(), val2.imag(), margin);
+ return (real && imag);
+}
+template <>
+bool Tester<double2>::TestSimilarity(const double2 val1, const double2 val2, const double margin) {
+ auto real = Tester<double>::TestSimilarity(val1.real(), val2.real(), margin);
+ auto imag = Tester<double>::TestSimilarity(val1.imag(), val2.imag(), margin);
+ return (real && imag);
+}
+
+// =================================================================================================
+
+// Handles a 'pass' or 'error' depending on whether there are any errors
+template <typename T>
+void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args) {
+
+ // Finished successfully
+ if (errors == 0) {
+ PrintTestResult(kSuccessData);
+ ReportPass();
+ }
+
+ // Error(s) occurred
+ else {
+ auto percentage = 100*errors / static_cast<float>(size);
+ PrintTestResult(kErrorData);
+ ReportError({StatusCode::kSuccess, StatusCode::kSuccess, percentage, args});
+ }
+}
+
+// Compares two status codes for equality. The outcome can be a pass (they are the same), a skipped
+// test (CLBlast reported a compilation error or an unsupported precision), or an error (they are
+// different).
+template <typename T>
+void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
+ const Arguments<T> &args) {
+
+ // Finished successfully
+ if (clblas_status == clblast_status) {
+ PrintTestResult(kSuccessStatus);
+ ReportPass();
+ }
+
+ // No support for this kind of precision
+ else if (clblast_status == StatusCode::kNoDoublePrecision ||
+ clblast_status == StatusCode::kNoHalfPrecision) {
+ PrintTestResult(kUnsupportedPrecision);
+ ReportSkipped();
+ }
+
+  // Could not compile the CLBlast kernel properly, or the routine is not implemented yet
+ else if (clblast_status == StatusCode::kBuildProgramFailure ||
+ clblast_status == StatusCode::kNotImplemented) {
+ PrintTestResult(kSkippedCompilation);
+ ReportSkipped();
+ }
+
+ // Error occurred
+ else {
+ PrintTestResult(kErrorStatus);
+ ReportError({clblas_status, clblast_status, kStatusError, args});
+ }
+}
+
+// =================================================================================================
+
+// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
+// routines. This function is specialised for the different data-types.
+template <>
+const std::vector<float> Tester<float>::GetExampleScalars() {
+ return {0.0f, 1.0f, 3.14f};
+}
+template <>
+const std::vector<double> Tester<double>::GetExampleScalars() {
+ return {0.0, 1.0, 3.14};
+}
+template <>
+const std::vector<float2> Tester<float2>::GetExampleScalars() {
+ return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}};
+}
+template <>
+const std::vector<double2> Tester<double2>::GetExampleScalars() {
+ return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}};
+}
+
+// =================================================================================================
+
+// A test can either pass, be skipped, or fail
+template <typename T>
+void Tester<T>::ReportPass() {
+ num_passed_++;
+}
+template <typename T>
+void Tester<T>::ReportSkipped() {
+ num_skipped_++;
+}
+template <typename T>
+void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
+ error_log_.push_back(error_log_entry);
+ num_errors_++;
+}
+
+// =================================================================================================
+
+// Prints the test-result symbol to screen. This function limits the maximum number of symbols per
+// line by printing newlines once every so many calls.
+template <typename T>
+void Tester<T>::PrintTestResult(const std::string &message) {
+ if (print_count_ == kResultsPerLine) {
+ print_count_ = 0;
+ fprintf(stdout, "\n ");
+ }
+ fprintf(stdout, "%s", message.c_str());
+ std::cout << std::flush;
+ print_count_++;
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Tester<float>;
+template class Tester<double>;
+template class Tester<float2>;
+template class Tester<double2>;
+
+// =================================================================================================
+} // namespace clblast
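
For reference, the comparison implemented by TestSimilarity above can be exercised in isolation. Below is a minimal standalone sketch (not part of this patch; the margin and sample values are illustrative) mirroring the same exact-match / near-zero / relative-error logic:

#include <cmath>
#include <cstdio>
#include <limits>

// Mirrors the tester's comparison: an exact match handles infinities, near-zero values fall back
// to an absolute check against the smallest normalised float, everything else uses relative error.
bool Similar(const float val1, const float val2, const double margin) {
  const auto difference = std::fabs(val1 - val2);
  if (val1 == val2) { return true; }
  if (val1 == 0.0f || val2 == 0.0f || difference < std::numeric_limits<float>::min()) {
    return difference < static_cast<float>(margin) * std::numeric_limits<float>::min();
  }
  return (difference / (std::fabs(val1) + std::fabs(val2))) < static_cast<float>(margin);
}

int main() {
  std::printf("%d\n", Similar(1.000f, 1.001f, 1.0e-2));  // prints 1: within the relative margin
  std::printf("%d\n", Similar(1.000f, 1.100f, 1.0e-2));  // prints 0: outside the relative margin
  return 0;
}

With the kErrorMargin of 1.0e-2 used by the tester, a 0.1% difference therefore passes while a 10% difference fails.
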
diff --git a/test/correctness/tester.h b/test/correctness/tester.h
new file mode 100644
index 00000000..12f6125a
--- /dev/null
+++ b/test/correctness/tester.h
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Tester class, providing a test-framework. GTest was used before, but
+// was not able to handle certain cases (e.g. template type + parameters). This is its (basic)
+// custom replacement.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_CORRECTNESS_TESTER_H_
+#define CLBLAST_TEST_CORRECTNESS_TESTER_H_
+
+#include <string>
+#include <vector>
+#include <memory>
+
+// The libraries
+#include <clBLAS.h>
+#include "clblast.h"
+
+#include "internal/utilities.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Tester {
+ public:
+
+ // Types of devices to consider
+ const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
+
+ // Maximum number of test results printed on a single line
+ static constexpr auto kResultsPerLine = size_t{64};
+
+ // Error percentage is not applicable: error was caused by an incorrect status
+ static constexpr auto kStatusError = -1.0f;
+
+ // Set the allowed error margin for floating-point comparisons
+ static constexpr auto kErrorMargin = 1.0e-2;
+
+ // Constants holding start and end strings for terminal-output in colour
+ const std::string kPrintError{"\x1b[31m"};
+ const std::string kPrintSuccess{"\x1b[32m"};
+ const std::string kPrintWarning{"\x1b[35m"};
+ const std::string kPrintMessage{"\x1b[1m"};
+ const std::string kPrintEnd{"\x1b[0m"};
+
+  // The output symbols used to report the different test outcomes
+ const std::string kSuccessData{kPrintSuccess + ":" + kPrintEnd};
+ const std::string kSuccessStatus{kPrintSuccess + "." + kPrintEnd};
+ const std::string kErrorData{kPrintError + "X" + kPrintEnd};
+ const std::string kErrorStatus{kPrintError + "/" + kPrintEnd};
+ const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
+ const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
+
+  // Combines expected and found status codes with an error percentage and the routine arguments
+ struct ErrorLogEntry {
+ StatusCode status_expect;
+ StatusCode status_found;
+ float error_percentage;
+ Arguments<T> args;
+ };
+
+ // Creates an instance of the tester, running on a particular OpenCL platform and device. It
+  // takes the routine's name and its command-line options as additional parameters.
+ explicit Tester(const size_t platform_id, const size_t device_id,
+ const std::string &name, const std::vector<std::string> &options);
+ ~Tester();
+
+ // These methods start and end a test-case. Within a test-case, multiple tests can be run.
+ void TestStart(const std::string &test_name, const std::string &test_configuration);
+ void TestEnd();
+
+ // Compares two floating point values for similarity. Allows for a certain relative error margin.
+ static bool TestSimilarity(const T val1, const T val2, const double margin);
+
+ // Tests either an error count (should be zero) or two error codes (must match)
+ void TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args);
+ void TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
+ const Arguments<T> &args);
+
+ protected:
+
+ // Retrieves a list of example scalars of the right type
+ const std::vector<T> GetExampleScalars();
+
+ // The OpenCL objects (accessible by derived classes)
+ Platform platform_;
+ Device device_;
+ Context context_;
+ CommandQueue queue_;
+
+ private:
+
+ // Internal methods to report a passed, skipped, or failed test
+ void ReportPass();
+ void ReportSkipped();
+ void ReportError(const ErrorLogEntry &log_entry);
+
+ // Prints the error or success symbol to screen
+ void PrintTestResult(const std::string &message);
+
+ // Logging and counting occurrences of errors
+ std::vector<ErrorLogEntry> error_log_;
+ size_t num_passed_;
+ size_t num_skipped_;
+ size_t num_errors_;
+
+  // Counting the number of test results printed on the current line
+ size_t print_count_;
+
+ // Counting the number of test-cases with and without failures
+ size_t tests_failed_;
+ size_t tests_passed_;
+
+ // Arguments relevant for a specific routine
+ std::vector<std::string> options_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_CORRECTNESS_TESTER_H_
+#endif
diff --git a/test/correctness/testxy.cc b/test/correctness/testxy.cc
new file mode 100644
index 00000000..0b708b3d
--- /dev/null
+++ b/test/correctness/testxy.cc
@@ -0,0 +1,172 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the TestXY class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include <algorithm>
+
+#include "correctness/testxy.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor, initializes the base class tester and input data
+template <typename T>
+TestXY<T>::TestXY(const size_t platform_id, const size_t device_id,
+ const std::string &name, const std::vector<std::string> &options,
+ const Routine clblast_lambda, const Routine clblas_lambda):
+ Tester<T>{platform_id, device_id, name, options},
+ clblast_lambda_(clblast_lambda),
+ clblas_lambda_(clblas_lambda) {
+
+ // Computes the maximum sizes. This allows for a single set of input/output buffers.
+ auto max_dim = *std::max_element(kVectorDims.begin(), kVectorDims.end());
+ auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
+ auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
+
+ // Creates test input data
+ x_source_.resize(max_dim*max_inc + max_offset);
+ y_source_.resize(max_dim*max_inc + max_offset);
+ PopulateVector(x_source_);
+ PopulateVector(y_source_);
+}
+
+// ===============================================================================================
+
+// Tests the routine for a wide variety of parameters
+template <typename T>
+void TestXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
+ TestStart("regular behaviour", name);
+
+ // Iterates over the vector dimension
+ for (auto &n: kVectorDims) {
+ args.n = n;
+
+ // Iterates over the increment-values and the offsets
+ for (auto &x_inc: kIncrements) {
+ args.x_inc = x_inc;
+ for (auto &x_offset: kOffsets) {
+ args.x_offset = x_offset;
+ for (auto &y_inc: kIncrements) {
+ args.y_inc = y_inc;
+ for (auto &y_offset: kOffsets) {
+ args.y_offset = y_offset;
+
+ // Computes the buffer sizes
+ auto x_size = n * x_inc + x_offset;
+ auto y_size = n * y_inc + y_offset;
+ if (x_size < 1 || y_size < 1) { continue; }
+
+ // Creates the OpenCL buffers
+ auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
+ auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
+ auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
+
+ // Iterates over the values for alpha
+ for (auto &alpha: kAlphaValues) {
+ args.alpha = alpha;
+
+ // Runs the reference clBLAS code
+ x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
+ r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
+ auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
+
+ // Runs the CLBlast code
+ x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
+ s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
+ auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
+
+ // Tests for equality of the two status codes
+ if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
+ TestErrorCodes(status1, status2, args);
+ continue;
+ }
+
+ // Downloads the results
+ std::vector<T> r_result(y_size, static_cast<T>(0));
+ std::vector<T> s_result(y_size, static_cast<T>(0));
+ r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
+ s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
+
+ // Checks for differences in the output
+ auto errors = size_t{0};
+ for (auto idn=size_t{0}; idn<n; ++idn) {
+ auto index = idn*y_inc + y_offset;
+ if (!TestSimilarity(r_result[index], s_result[index], kErrorMargin)) {
+ errors++;
+ }
+ }
+
+ // Tests the error count (should be zero)
+ TestErrorCount(errors, n, args);
+ }
+ }
+ }
+ }
+ }
+ }
+ TestEnd();
+}
+
+// =================================================================================================
+
+// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only the returned
+// status codes; it does not check any computed results.
+template <typename T>
+void TestXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
+ TestStart("invalid buffer sizes", name);
+
+ // Sets example test parameters
+ args.n = kBufferSize;
+
+ // Iterates over test buffer sizes
+ const std::vector<size_t> kBufferSizes = {0, kBufferSize - 1, kBufferSize};
+ for (auto &x_size: kBufferSizes) {
+ for (auto &y_size: kBufferSizes) {
+
+ // Iterates over test increments
+ for (auto &x_inc: kInvalidIncrements) {
+ args.x_inc = x_inc;
+ for (auto &y_inc: kInvalidIncrements) {
+ args.y_inc = y_inc;
+
+ // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
+ // want to be able to create invalid buffers (no error checking here).
+ auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
+ auto x_vec = Buffer(x);
+ auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
+ auto r_vec = Buffer(r);
+ auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
+ auto s_vec = Buffer(s);
+
+ // Runs the two routines
+ auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
+ auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
+
+ // Tests for equality of the two status codes
+ TestErrorCodes(status1, status2, args);
+ }
+ }
+ }
+ }
+ TestEnd();
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class TestXY<float>;
+template class TestXY<double>;
+template class TestXY<float2>;
+template class TestXY<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/test/correctness/testxy.h b/test/correctness/testxy.h
new file mode 100644
index 00000000..32cd91fa
--- /dev/null
+++ b/test/correctness/testxy.h
@@ -0,0 +1,83 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file tests any vector-vector (X,Y) routine. It contains two types of tests: one testing
+// all sorts of input combinations, and one deliberately testing with invalid values.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_CORRECTNESS_TESTXY_H_
+#define CLBLAST_TEST_CORRECTNESS_TESTXY_H_
+
+#include <vector>
+#include <string>
+
+#include "correctness/tester.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXY: public Tester<T> {
+ public:
+
+ // Uses several variables from the Tester class
+ using Tester<T>::context_;
+ using Tester<T>::queue_;
+ using Tester<T>::kErrorMargin;
+
+ // Uses several helper functions from the Tester class
+ using Tester<T>::TestStart;
+ using Tester<T>::TestEnd;
+ using Tester<T>::TestSimilarity;
+ using Tester<T>::TestErrorCount;
+ using Tester<T>::TestErrorCodes;
+ using Tester<T>::GetExampleScalars;
+
+ // Test settings for the regular test. Append to this list in case more tests are required.
+ const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
+ const std::vector<size_t> kOffsets = { 0, 10 };
+ const std::vector<size_t> kIncrements = { 1, 2 };
+ const std::vector<T> kAlphaValues = GetExampleScalars();
+
+ // Test settings for the invalid test
+ const std::vector<size_t> kInvalidIncrements = { 0, 1 };
+ const size_t kBufferSize = 512;
+
+ // Shorthand for a BLAS routine
+ using Routine = std::function<StatusCode(const Arguments<T>&,
+ const Buffer&, const Buffer&,
+ CommandQueue&)>;
+
+ // Constructor, initializes the base class tester and input data
+ TestXY(const size_t platform_id, const size_t device_id,
+ const std::string &name, const std::vector<std::string> &options,
+ const Routine clblast_lambda, const Routine clblas_lambda);
+
+ // The test functions, taking no inputs
+ void TestRegular(Arguments<T> &args, const std::string &name);
+ void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
+
+ private:
+
+ // Source data to test with
+ std::vector<T> x_source_;
+ std::vector<T> y_source_;
+
+ // The routines to test
+ Routine clblast_lambda_;
+ Routine clblas_lambda_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_CORRECTNESS_TESTXY_H_
+#endif
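
To show how the pieces above fit together, here is a minimal sketch (not part of this patch) of wiring two Routine lambdas into TestXY. The lambda bodies are hypothetical placeholders; the actual correctness binaries (e.g. test/correctness/routines/xaxpy.cc) call CLBlast and clBLAS at those points:

#include "correctness/testxy.h"

namespace clblast {

void RunExampleTests() {
  // Placeholder routines: the real tests launch the library calls here and return their status
  auto run_clblast = [](const Arguments<float> &args, const Buffer &x, const Buffer &y,
                        CommandQueue &queue) -> StatusCode {
    return StatusCode::kSuccess;
  };
  auto run_clblas = [](const Arguments<float> &args, const Buffer &x, const Buffer &y,
                       CommandQueue &queue) -> StatusCode {
    return StatusCode::kSuccess;
  };

  // Platform/device IDs, test name, and option list are illustrative only
  TestXY<float> tester(0, 0, "EXAMPLE", {kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgAlpha},
                       run_clblast, run_clblas);
  auto args = Arguments<float>{};
  tester.TestRegular(args, "EXAMPLE");
  tester.TestInvalidBufferSizes(args, "EXAMPLE");
}

} // namespace clblast
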
diff --git a/test/performance/client.cc b/test/performance/client.cc
new file mode 100644
index 00000000..ddaea0e1
--- /dev/null
+++ b/test/performance/client.cc
@@ -0,0 +1,295 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the common functions for the client-test environment.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <chrono>
+
+namespace clblast {
+// =================================================================================================
+
+// This is the vector-vector variant of the set-up/tear-down client routine.
+template <typename T>
+void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
+ const std::vector<std::string> &options) {
+
+ // Simple command line argument parser with defaults
+ auto args = ParseArguments<T>(argc, argv, options);
+ if (args.print_help) { return; }
+
+ // Prints the header of the output table
+ PrintTableHeader(args.silent, options);
+
+ // Initializes OpenCL and the libraries
+ auto platform = Platform(args.platform_id);
+ auto device = Device(platform, kDeviceType, args.device_id);
+ auto context = Context(device);
+ auto queue = CommandQueue(context, device);
+ if (args.compare_clblas) { clblasSetup(); }
+
+  // Iterates over all "num_steps" values, jumping by "step" each time
+ auto s = size_t{0};
+ while(true) {
+
+ // Computes the data sizes
+ auto x_size = args.n*args.x_inc + args.x_offset;
+ auto y_size = args.n*args.y_inc + args.y_offset;
+
+ // Populates input host vectors with random data
+ std::vector<T> x_source(x_size);
+ std::vector<T> y_source(y_size);
+ PopulateVector(x_source);
+ PopulateVector(y_source);
+
+ // Creates the vectors on the device
+ auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
+ auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
+ x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
+ y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
+
+ // Runs the routine-specific code
+ client_routine(args, x_buffer, y_buffer, queue);
+
+ // Makes the jump to the next step
+ ++s;
+ if (s >= args.num_steps) { break; }
+ args.n += args.step;
+ }
+
+  // Cleans up and returns
+ if (args.compare_clblas) { clblasTeardown(); }
+}
+
+// Compiles the above function
+template void ClientXY<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
+template void ClientXY<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
+template void ClientXY<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
+template void ClientXY<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
+
+// =================================================================================================
+
+// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
+template <typename T>
+void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
+ const std::vector<std::string> &options) {
+
+ // Simple command line argument parser with defaults
+ auto args = ParseArguments<T>(argc, argv, options);
+ if (args.print_help) { return; }
+
+ // Prints the header of the output table
+ PrintTableHeader(args.silent, options);
+
+ // Initializes OpenCL and the libraries
+ auto platform = Platform(args.platform_id);
+ auto device = Device(platform, kDeviceType, args.device_id);
+ auto context = Context(device);
+ auto queue = CommandQueue(context, device);
+ if (args.compare_clblas) { clblasSetup(); }
+
+ // Computes whether or not the matrices are transposed. Note that we assume a default of
+ // column-major and no-transpose. If one of them is different (but not both), then rotated
+ // is considered true.
+ auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) ||
+ (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+ auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) ||
+ (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
+ auto c_rotated = (args.layout == Layout::kRowMajor);
+
+  // Iterates over all "num_steps" values, jumping by "step" each time
+ auto s = size_t{0};
+ while(true) {
+
+ // Computes the data sizes
+ auto a_two = (a_rotated) ? args.m : args.k;
+ auto b_two = (b_rotated) ? args.k : args.n;
+ auto c_two = (c_rotated) ? args.m : args.n;
+ auto a_size = a_two * args.a_ld + args.a_offset;
+ auto b_size = b_two * args.b_ld + args.b_offset;
+ auto c_size = c_two * args.c_ld + args.c_offset;
+
+ // Populates input host matrices with random data
+ std::vector<T> a_source(a_size);
+ std::vector<T> b_source(b_size);
+ std::vector<T> c_source(c_size);
+ PopulateVector(a_source);
+ PopulateVector(b_source);
+ PopulateVector(c_source);
+
+ // Creates the matrices on the device
+ auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
+ auto b_buffer = Buffer(context, CL_MEM_READ_WRITE, b_size*sizeof(T));
+ auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
+ a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
+ b_buffer.WriteBuffer(queue, b_size*sizeof(T), b_source);
+ c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
+
+ // Runs the routine-specific code
+ client_routine(args, a_buffer, b_buffer, c_buffer, queue);
+
+ // Makes the jump to the next step
+ ++s;
+ if (s >= args.num_steps) { break; }
+ args.m += args.step;
+ args.n += args.step;
+ args.k += args.step;
+ args.a_ld += args.step;
+ args.b_ld += args.step;
+ args.c_ld += args.step;
+ }
+
+  // Cleans up and returns
+ if (args.compare_clblas) { clblasTeardown(); }
+}
+
+// Compiles the above function
+template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
+template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
+template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
+template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
+
+// =================================================================================================
+
+// Parses all arguments available for the CLBlast client testers. Some arguments might not be
+// applicable, but are searched for anyway to be able to create one common argument parser. All
+// arguments have a default value in case they are not found.
+template <typename T>
+Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options) {
+ auto args = Arguments<T>{};
+ auto help = std::string{"Options given/available:\n"};
+
+ // These are the options which are not for every client: they are optional
+ for (auto &o: options) {
+
+ // Data-sizes
+ if (o == kArgM) { args.m = args.k = GetArgument(argc, argv, help, kArgM, 512UL); }
+ if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
+ if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
+
+ // Data-layouts
+ if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
+ if (o == kArgATransp) { args.a_transpose = GetArgument(argc, argv, help, kArgATransp, Transpose::kNo); }
+ if (o == kArgBTransp) { args.b_transpose = GetArgument(argc, argv, help, kArgBTransp, Transpose::kNo); }
+ if (o == kArgSide) { args.side = GetArgument(argc, argv, help, kArgSide, Side::kLeft); }
+ if (o == kArgTriangle) { args.triangle = GetArgument(argc, argv, help, kArgTriangle, Triangle::kUpper); }
+
+ // Vector arguments
+ if (o == kArgXInc) { args.x_inc = GetArgument(argc, argv, help, kArgXInc, size_t{1}); }
+ if (o == kArgYInc) { args.y_inc = GetArgument(argc, argv, help, kArgYInc, size_t{1}); }
+ if (o == kArgXOffset) { args.x_offset = GetArgument(argc, argv, help, kArgXOffset, size_t{0}); }
+ if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); }
+
+ // Matrix arguments
+ if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, args.k); }
+ if (o == kArgBLeadDim) { args.b_ld = GetArgument(argc, argv, help, kArgBLeadDim, args.n); }
+ if (o == kArgCLeadDim) { args.c_ld = GetArgument(argc, argv, help, kArgCLeadDim, args.n); }
+ if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }
+ if (o == kArgBOffset) { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); }
+ if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
+
+ // Scalar values
+ if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); }
+ if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
+ }
+
+ // These are the options common to all routines
+ args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+ args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+ args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+ args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, true);
+ args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
+ args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
+ args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
+ args.print_help = CheckArgument(argc, argv, help, kArgHelp);
+ args.silent = CheckArgument(argc, argv, help, kArgQuiet);
+ args.no_abbrv = CheckArgument(argc, argv, help, kArgNoAbbreviations);
+
+ // Prints the chosen (or defaulted) arguments to screen. This also serves as the help message,
+ // which is thus always displayed (unless silence is specified).
+ if (!args.silent) { fprintf(stdout, "%s\n", help.c_str()); }
+
+ // Returns the arguments
+ return args;
+}
+
+// =================================================================================================
+
+// Creates a vector of timing results, filled with execution times of the 'main computation'. The
+// timing is performed with the C++ chrono steady clock. The function returns the minimum value
+// found in the vector of timing results, expressed in milliseconds.
+double TimedExecution(const size_t num_runs, std::function<void()> main_computation) {
+ auto timings = std::vector<double>(num_runs);
+ for (auto &timing: timings) {
+ auto start_time = std::chrono::steady_clock::now();
+
+ // Executes the main computation
+ main_computation();
+
+ // Records and stores the end-time
+ auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+ timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+ }
+ return *std::min_element(timings.begin(), timings.end());
+}
+
+// =================================================================================================
+
+// Prints the header of the performance table
+void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
+ if (!silent) {
+ for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
+ fprintf(stdout, " | <-- CLBlast --> | <-- clBLAS --> |\n");
+ }
+ for (auto &argument: args) { fprintf(stdout, "%9s;", argument.c_str()); }
+ fprintf(stdout, "%9s;%9s;%9s;%9s;%9s;%9s\n",
+ "ms_1", "GFLOPS_1", "GBs_1", "ms_2", "GFLOPS_2", "GBs_2");
+}
+
+// Print a performance-result row
+void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
+ const bool no_abbrv, const double ms_clblast, const double ms_clblas,
+ const unsigned long long flops, const unsigned long long bytes) {
+
+ // Computes the GFLOPS and GB/s metrics
+ auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
+  auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas : 0;
+ auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
+  auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas : 0;
+
+ // Outputs the argument values
+ for (auto &argument: args_int) {
+ if (!no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
+ fprintf(stdout, "%8luM;", argument/(1024*1024));
+ }
+ else if (!no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
+ fprintf(stdout, "%8luK;", argument/1024);
+ }
+ else {
+ fprintf(stdout, "%9lu;", argument);
+ }
+ }
+ for (auto &argument: args_string) {
+ fprintf(stdout, "%9s;", argument.c_str());
+ }
+
+ // Outputs the performance numbers
+ fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf;%9.2lf;%9.1lf;%9.1lf\n",
+ ms_clblast, gflops_clblast, gbs_clblast,
+ ms_clblas, gflops_clblas, gbs_clblas);
+}
+
+// =================================================================================================
+} // namespace clblast
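
The min-of-N pattern in TimedExecution above can be reproduced without any OpenCL dependencies. The sketch below (standalone, with a dummy workload) shows the same structure; reporting the minimum filters out one-off stalls such as first-run warm-up:

#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

// Runs 'computation' num_runs times and returns the fastest time in milliseconds
double MinimumTimeMs(const std::size_t num_runs, std::function<void()> computation) {
  auto timings = std::vector<double>(num_runs);
  for (auto &timing: timings) {
    auto start = std::chrono::steady_clock::now();
    computation();
    auto elapsed = std::chrono::steady_clock::now() - start;
    timing = std::chrono::duration<double, std::milli>(elapsed).count();
  }
  return *std::min_element(timings.begin(), timings.end());
}

int main() {
  auto data = std::vector<double>(1 << 20, 1.0);
  auto ms = MinimumTimeMs(10, [&data]() {
    for (auto &x: data) { x = x*1.0001 + 0.5; }  // dummy workload standing in for a kernel launch
  });
  std::printf("fastest of 10 runs: %.3lf ms\n", ms);
  return 0;
}
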
diff --git a/test/performance/client.h b/test/performance/client.h
new file mode 100644
index 00000000..2b9991fe
--- /dev/null
+++ b/test/performance/client.h
@@ -0,0 +1,85 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides common function declarations to be used with the test clients.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_PERFORMANCE_CLIENT_H_
+#define CLBLAST_TEST_PERFORMANCE_CLIENT_H_
+
+#include <string>
+#include <vector>
+
+// The libraries to test
+#include <clBLAS.h>
+#include "clblast.h"
+
+#include "internal/utilities.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Types of devices to consider
+const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
+
+// =================================================================================================
+
+// Shorthand for a BLAS routine with 2 or 3 OpenCL buffers as argument
+template <typename T>
+using Routine2 = std::function<void(const Arguments<T>&,
+ const Buffer&, const Buffer&,
+ CommandQueue&)>;
+template <typename T>
+using Routine3 = std::function<void(const Arguments<T>&,
+ const Buffer&, const Buffer&, const Buffer&,
+ CommandQueue&)>;
+
+// =================================================================================================
+
+// These are the main client functions, setting up arguments, matrices, OpenCL buffers, etc. After
+// set-up, they call the client routine, passed as argument to this function.
+template <typename T>
+void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
+ const std::vector<std::string> &options);
+template <typename T>
+void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
+ const std::vector<std::string> &options);
+
+// =================================================================================================
+
+// Parses all command-line arguments, filling in the arguments structure. If no command-line
+// argument is given for a particular argument, it is filled in with a default value.
+template <typename T>
+Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options);
+
+// Retrieves only the precision command-line argument, since the above function is templated based
+// on the precision
+Precision GetPrecision(int argc, char *argv[]);
+
+// =================================================================================================
+
+// Runs a function a given number of times and returns the execution time of the shortest instance
+double TimedExecution(const size_t num_runs, std::function<void()> main_computation);
+
+// =================================================================================================
+
+// Prints the header of a performance-data table
+void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
+
+// Prints a row of performance data, including results of two libraries
+void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
+                   const bool no_abbrv, const double ms_clblast, const double ms_clblas,
+ const unsigned long long flops, const unsigned long long bytes);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_PERFORMANCE_CLIENT_H_
+#endif
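
As a worked example of the conversions behind PrintTableRow (numbers are illustrative): an SAXPY with n = 8*1024*1024 counts 2n = 16.8 million flops and 3n*sizeof(float) = 100.7 MB of memory traffic, so a 1.0 ms run is reported as roughly 16.8 GFLOPS and 100.7 GB/s, matching the (flops*1e-6)/ms and (bytes*1e-6)/ms expressions in client.cc.
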
diff --git a/test/performance/graphs/common.r b/test/performance/graphs/common.r
new file mode 100644
index 00000000..4572e559
--- /dev/null
+++ b/test/performance/graphs/common.r
@@ -0,0 +1,189 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the common performance scripts, such as creating a graph.
+#
+# ==================================================================================================
+
+# Colours
+black = "#000000"
+grey = "#888888"
+purplish = "#550077" # [ 85, 0,119] lumi=26
+blueish = "#4765b1" # [ 71,101,177] lumi=100
+redish = "#d67568" # [214,117,104] lumi=136
+greenish = "#9bd4ca" # [155,212,202] lumi=199
+colourset = c(blueish, redish, greenish, purplish)
+
+# Sets the graph markers (circles, triangles, etc.)
+pchs = c(15, 18, 17, 12)
+
+# Other constants
+kilo = 1024
+mega = 1024*1024
+
+# R options
+options("width"=170)
+
+# ==================================================================================================
+
+# Constants
+num_runs <- 4
+devices <- c("-platform","-device")
+options_string <- "-q -no_abbrv"
+library_names <- c("CLBlast", "clBLAS")
+
+# Command-line arguments
+command_line <- commandArgs(trailingOnly=TRUE)
+if (length(command_line) != 2) {
+ print("Usage for device Z on platform Y: Rscript xxxxx.r Y Z")
+ quit()
+}
+platform_id <- command_line[1]
+device_id <- command_line[2]
+
+# Selects the device
+devices_values <- c(platform_id, device_id)
+devices_string <- paste(devices, devices_values, collapse=" ")
+
+# ==================================================================================================
+
+# The main function
+main <- function(routine_name, precision, test_names, test_values,
+ test_xlabels, test_xaxis, metric_gflops) {
+
+ # Names
+ display_name <- toupper(routine_name)
+ if (precision == 16) { display_name <- gsub("^X","H",display_name); }
+ if (precision == 32) { display_name <- gsub("^X","S",display_name); }
+ if (precision == 64) { display_name <- gsub("^X","D",display_name); }
+ if (precision == 3232) { display_name <- gsub("^X","C",display_name); }
+ if (precision == 6464) { display_name <- gsub("^X","Z",display_name); }
+ executable <- paste("./client_", routine_name, sep="")
+
+  # Configures the output file
+ pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)
+ par(mfrow=c(2, 3))
+ par(oma=c(0, 0, 0, 0))
+ par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
+ par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
+
+ # Loops over the test-cases
+ for (test_id in 1:length(test_names)) {
+ params_values <- test_values[[test_id]]
+
+ # Loops over the commands within a single list (within a case)
+ for (command_id in 1:length(params_values)) {
+
+ # Runs the client and captures the result
+ params_string <- paste(parameters, params_values[[command_id]], collapse=" ")
+ arguments <- paste(devices_string, params_string, options_string, sep=" ")
+ result_string <- system2(command=executable, args=arguments, stdout=TRUE)
+
+ # Reads the result into a dataframe
+ command_db <- read.csv(text=result_string, sep=";")
+
+ # Append the results to the final dataframe
+ if (command_id == 1) {
+ db <- command_db
+ } else {
+ db <- rbind(db, command_db)
+ }
+ }
+ print(db)
+
+ # Sets the values on the x-axis and their labels (test dependent)
+ if (is.character(test_xaxis[[test_id]][[1]])) {
+ xdata <- db[,test_xaxis[[test_id]][[1]]]
+ xtics <- xdata
+ log_scale <- test_xaxis[[test_id]][[2]]
+ }
+ else {
+ xdata <- test_xaxis[[test_id]][[1]]
+ xtics <- test_xaxis[[test_id]][[2]]
+ log_scale <- ""
+ }
+
+ # Plots the graph with GFLOPS on the Y-axis
+ if (metric_gflops) {
+ plot_graph(xdata=xdata, ydata=list(db$GFLOPS_1, db$GFLOPS_2), log_setting=log_scale,
+ xmin=min(xdata), xmax=max(xdata),
+ ymin=0, ymax=max(max(db$GFLOPS_1),max(db$GFLOPS_2)),
+ xtics=xtics,
+ xlabel=test_xlabels[[test_id]], ylabel="GFLOPS (higher is better)",
+ graph_title=paste(display_name, test_names[[test_id]], sep=" "),
+ multiple=50, experiment_names=library_names)
+ # Plots the graph with GB/s on the Y-axis
+ } else {
+ plot_graph(xdata=xdata, ydata=list(db$GBs_1, db$GBs_2), log_setting=log_scale,
+ xmin=min(xdata), xmax=max(xdata),
+ ymin=0, ymax=max(max(db$GBs_1),max(db$GBs_2)),
+ xtics=xtics,
+ xlabel=test_xlabels[[test_id]], ylabel="GB/s (higher is better)",
+ graph_title=paste(display_name, test_names[[test_id]], sep=" "),
+ multiple=10, experiment_names=library_names)
+ }
+ }
+}
+
+# ==================================================================================================
+
+# Plots data
+plot_graph <- function(xdata, ydata, log_setting,
+ xmin, xmax, ymin, ymax,
+ xtics, xlabel, ylabel,
+ graph_title,
+ multiple, experiment_names) {
+
+ # Update the ymax to the next multiple of something
+ ymax <- multiple*ceiling(ymax/multiple)
+
+ # Add kilo or mega to the x-labels
+ for (i in 1:length(xtics)) {
+ if (!is.na(as.numeric(xtics[i]))) {
+ if (as.numeric(xtics[i])%%mega == 0) {
+ xtics[i] <- paste(as.character(as.numeric(xtics[i])/mega), "M", sep="")
+ } else if (as.numeric(xtics[i])%%kilo == 0) {
+ xtics[i] <- paste(as.character(as.numeric(xtics[i])/kilo), "K", sep="")
+ }
+ }
+ }
+
+ # Creates an initial graph with axis but without data
+ par(new=F)
+ plot(x=xmin:xmax, y=rep(1, length(xmin:xmax)), log=log_setting,
+ main="", xlab="", ylab="",
+       ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, type="n")
+ axis(side=2, las=2)
+ axis(side=1, at=xdata, labels=xtics, las=2)
+ title(xlab=xlabel, line=-1)
+ title(ylab=ylabel, line=2)
+ title(graph_title, line=-2)
+ par(new=T)
+
+ # Loops over all experiments
+ num_experiments <- length(ydata)
+ for (id in 1:num_experiments) {
+
+ # Plots the data for this experiment
+ plot(x=xdata, y=ydata[[id]], log=log_setting,
+ col=colourset[id], pch=pchs[id], lty=1, lwd=1, cex=1,
+ xlab="", ylab="", ylim=c(ymin, ymax), xlim=c(xmin, xmax),
+         axes=F, type="b", xpd=T)
+ par(new=T)
+ }
+
+ # Add a legend
+ legend("bottomright", experiment_names,
+ lwd=1, ncol=1, col=colourset, pch=pchs, lty=1, cex=1,
+ bty="n", xpd=T)
+
+ # Done
+ par(new=F)
+}
+
+# ==================================================================================================
diff --git a/test/performance/graphs/xaxpy.r b/test/performance/graphs/xaxpy.r
new file mode 100644
index 00000000..187590aa
--- /dev/null
+++ b/test/performance/graphs/xaxpy.r
@@ -0,0 +1,96 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for the Xaxpy routine
+#
+# ==================================================================================================
+
+# Includes the common functions
+args <- commandArgs(trailingOnly = FALSE)
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xaxpy"
+parameters <- c("-n","-incx","-incy",
+ "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+ "multiples of 256K",
+ "multiples of 256K (+1)",
+ "around n=1M",
+ "around n=16M",
+ "strides (n=8M)",
+ "powers of 2"
+)
+
+# Defines the test-cases
+test_values <- list(
+ list(c(256*kilo, 1, 1, 16, 256*kilo, num_runs, precision)),
+ list(c(256*kilo+1, 1, 1, 16, 256*kilo, num_runs, precision)),
+ list(c(1*mega, 1, 1, 16, 1, num_runs, precision)),
+ list(c(16*mega, 1, 1, 16, 1, num_runs, precision)),
+ list(
+ c(8*mega, 1, 1, 1, 0, num_runs, precision),
+ c(8*mega, 2, 1, 1, 0, num_runs, precision),
+ c(8*mega, 4, 1, 1, 0, num_runs, precision),
+ c(8*mega, 8, 1, 1, 0, num_runs, precision),
+ c(8*mega, 1, 2, 1, 0, num_runs, precision),
+ c(8*mega, 1, 4, 1, 0, num_runs, precision),
+ c(8*mega, 1, 8, 1, 0, num_runs, precision),
+ c(8*mega, 2, 2, 1, 0, num_runs, precision),
+ c(8*mega, 4, 4, 1, 0, num_runs, precision),
+ c(8*mega, 8, 8, 1, 0, num_runs, precision)
+ ),
+ list(
+ c(32*kilo, 1, 1, 1, 0, num_runs, precision),
+ c(64*kilo, 1, 1, 1, 0, num_runs, precision),
+ c(128*kilo, 1, 1, 1, 0, num_runs, precision),
+ c(256*kilo, 1, 1, 1, 0, num_runs, precision),
+ c(512*kilo, 1, 1, 1, 0, num_runs, precision),
+ c(1*mega, 1, 1, 1, 0, num_runs, precision),
+ c(2*mega, 1, 1, 1, 0, num_runs, precision),
+ c(4*mega, 1, 1, 1, 0, num_runs, precision),
+ c(8*mega, 1, 1, 1, 0, num_runs, precision),
+ c(16*mega, 1, 1, 1, 0, num_runs, precision),
+ c(32*mega, 1, 1, 1, 0, num_runs, precision),
+ c(64*mega, 1, 1, 1, 0, num_runs, precision)
+ )
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+ "vector sizes (n)",
+ "vector sizes (n)",
+ "vector sizes (n)",
+ "vector sizes (n)",
+ "increments/strides for x and y",
+ "vector sizes (n)"
+)
+
+# Defines the x-axis of the test-cases
+test_xaxis <- list(
+ c("n", ""),
+ c("n", ""),
+ c("n", ""),
+ c("n", ""),
+ list(1:10, c("x1y1", "x2y1", "x4y1", "x8y1", "x1y2", "x1y4", "x1y8", "x2y2", "x4y4", "x8y8")),
+ c("n", "x")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+ test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=FALSE)
+
+# ================================================================================================== \ No newline at end of file
diff --git a/test/performance/graphs/xgemm.r b/test/performance/graphs/xgemm.r
new file mode 100755
index 00000000..22f63b77
--- /dev/null
+++ b/test/performance/graphs/xgemm.r
@@ -0,0 +1,94 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for the Xgemm routine
+#
+# ==================================================================================================
+
+# Includes the common functions
+args <- commandArgs(trailingOnly = FALSE)
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xgemm"
+parameters <- c("-m","-n","-k","-layout","-transA","-transB",
+ "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+ "multiples of 128",
+ "multiples of 128 (+1)",
+ "around m=n=k=512",
+ "around m=n=k=2048",
+ "layouts and transposing (m=n=k=1024)",
+ "powers of 2"
+)
+
+# Defines the test-cases
+test_values <- list(
+ list(c(128, 128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
+ list(c(129, 129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
+ list(c(512, 512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
+ list(c(2048, 2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
+ list(
+ c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+ c(1024, 1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
+ c(1024, 1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
+ c(1024, 1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
+ c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
+ c(1024, 1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
+ c(1024, 1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
+ c(1024, 1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
+ ),
+ list(
+ c(8, 8, 8, 0, 0, 0, 1, 0, num_runs, precision),
+ c(16, 16, 16, 0, 0, 0, 1, 0, num_runs, precision),
+ c(32, 32, 32, 0, 0, 0, 1, 0, num_runs, precision),
+ c(64, 64, 64, 0, 0, 0, 1, 0, num_runs, precision),
+ c(128, 128, 128, 0, 0, 0, 1, 0, num_runs, precision),
+ c(256, 256, 256, 0, 0, 0, 1, 0, num_runs, precision),
+ c(512, 512, 512, 0, 0, 0, 1, 0, num_runs, precision),
+ c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+ c(2048, 2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
+ c(4096, 4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
+ c(8192, 8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
+ )
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+ "matrix sizes (m=n=k)",
+ "matrix sizes (m=n=k)",
+ "matrix sizes (m=n=k)",
+ "matrix sizes (m=n=k)",
+ "layout (row/col), transA (n/y), transB (n/y)",
+ "matrix sizes (m=n=k)"
+)
+
+# Defines the x-axis of the test-cases
+test_xaxis <- list(
+ c("m", ""),
+ c("m", ""),
+ c("m", ""),
+ c("m", ""),
+ list(1:8, c("row,n,n", "row,n,y", "row,y,n", "row,y,y",
+ "col,n,n", "col,n,y", "col,y,n", "col,y,y")),
+ c("m", "x")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+ test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
+
+# ================================================================================================== \ No newline at end of file
diff --git a/test/performance/graphs/xsymm.r b/test/performance/graphs/xsymm.r
new file mode 100644
index 00000000..6493f52a
--- /dev/null
+++ b/test/performance/graphs/xsymm.r
@@ -0,0 +1,94 @@
+
+# ==================================================================================================
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+# project uses a tab-size of two spaces and a max-width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+#
+# This file implements the performance script for the Xsymm routine
+#
+# ==================================================================================================
+
+# Includes the common functions
+args <- commandArgs(trailingOnly = FALSE)
+thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
+source(file.path(dirname(thisfile), "common.r"))
+
+# ==================================================================================================
+
+# Settings
+routine_name <- "xsymm"
+parameters <- c("-m","-n","-layout","-triangle","-side",
+ "-num_steps","-step","-runs","-precision")
+precision <- 32
+
+# Sets the names of the test-cases
+test_names <- list(
+ "multiples of 128",
+ "multiples of 128 (+1)",
+ "around m=n=512",
+ "around m=n=2048",
+ "layouts and triangle/side (m=n=1024)",
+ "powers of 2"
+)
+
+# Defines the test-cases
+test_values <- list(
+ list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
+ list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
+ list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
+ list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
+ list(
+ c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+ c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
+ c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
+ c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
+ c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
+ c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
+ c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
+ c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
+ ),
+ list(
+ c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
+ c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
+ c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
+ c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
+ c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
+ c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
+ c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
+ c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
+ c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
+ c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
+ c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
+ )
+)
+
+# Defines the x-labels corresponding to the test-cases
+test_xlabels <- list(
+ "matrix sizes (m=n)",
+ "matrix sizes (m=n)",
+ "matrix sizes (m=n)",
+ "matrix sizes (m=n)",
+ "layout (row/col), triangle (up/lo), side (l/r)",
+ "matrix sizes (m=n)"
+)
+
+# Defines the x-axis of the test-cases
+test_xaxis <- list(
+ c("m", ""),
+ c("m", ""),
+ c("m", ""),
+ c("m", ""),
+ list(1:8, c("row,up,l", "row,up,r", "row,lo,l", "row,lo,r",
+ "col,up,l", "col,up,r", "col,lo,l", "col,lo,r")),
+ c("m", "x")
+)
+
+# ==================================================================================================
+
+# Start the script
+main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
+ test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
+
+# ================================================================================================== \ No newline at end of file
diff --git a/test/performance/routines/xaxpy.cc b/test/performance/routines/xaxpy.cc
new file mode 100644
index 00000000..23d76099
--- /dev/null
+++ b/test/performance/routines/xaxpy.cc
@@ -0,0 +1,97 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy command-line interface tester.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <exception>
+
+#include "wrapper_clblas.h"
+#include "performance/client.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The client, used for performance testing. It contains the function calls to CLBlast and to other
+// libraries to compare against.
+template <typename T>
+void PerformanceXaxpy(const Arguments<T> &args,
+ const Buffer &x_vec, const Buffer &y_vec,
+ CommandQueue &queue) {
+
+ // Creates the CLBlast lambda
+ auto clblast_lambda = [&args, &x_vec, &y_vec, &queue]() {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Axpy(args.n, args.alpha,
+ x_vec(), args.x_offset, args.x_inc,
+ y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ if (status != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
+ }
+ };
+
+ // Creates the clBLAS lambda (for comparison)
+ auto clblas_lambda = [&args, &x_vec, &y_vec, &queue]() {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXaxpy(args.n, args.alpha,
+ x_vec(), args.x_offset, args.x_inc,
+ y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ if (status != CL_SUCCESS) {
+ throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
+ }
+ };
+
+ // Runs the routines and collect the timings
+ auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
+ auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
+
+ // Prints the performance of both libraries
+ const auto flops = 2 * args.n;
+ const auto bytes = (3 * args.n) * sizeof(T);
+ const auto output_ints = std::vector<size_t>{args.n, args.x_inc, args.y_inc,
+ args.x_offset, args.y_offset};
+ const auto output_strings = std::vector<std::string>{ToString(args.alpha)};
+ PrintTableRow(output_ints, output_strings, args.no_abbrv,
+ ms_clblast, ms_clblas, flops, bytes);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void ClientXaxpy(int argc, char *argv[]) {
+ const auto o = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
+ kArgXOffset, kArgYOffset, kArgAlpha};
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: ClientXY<float>(argc, argv, PerformanceXaxpy<float>, o); break;
+ case Precision::kDouble: ClientXY<double>(argc, argv, PerformanceXaxpy<double>, o); break;
+ case Precision::kComplexSingle: ClientXY<float2>(argc, argv, PerformanceXaxpy<float2>, o); break;
+ case Precision::kComplexDouble: ClientXY<double2>(argc, argv, PerformanceXaxpy<double2>, o); break;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::ClientXaxpy(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/xgemm.cc b/test/performance/routines/xgemm.cc
new file mode 100644
index 00000000..234e9fdb
--- /dev/null
+++ b/test/performance/routines/xgemm.cc
@@ -0,0 +1,115 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm command-line interface tester.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <exception>
+
+#include "wrapper_clblas.h"
+#include "performance/client.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The client, used for performance testing. It contains the function calls to CLBlast and to other
+// libraries to compare against.
+template <typename T>
+void PerformanceXgemm(const Arguments<T> &args,
+ const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
+ CommandQueue &queue) {
+
+ // Creates the CLBlast lambda
+ auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
+ args.m, args.n, args.k,
+ args.alpha,
+ a_mat(), args.a_offset, args.a_ld,
+ b_mat(), args.b_offset, args.b_ld,
+ args.beta,
+ c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ if (status != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
+ }
+ };
+
+ // Creates the clBLAS lambda (for comparison)
+ auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasTranspose>(args.b_transpose),
+ args.m, args.n, args.k,
+ args.alpha,
+ a_mat(), args.a_offset, args.a_ld,
+ b_mat(), args.b_offset, args.b_ld,
+ args.beta,
+ c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ if (status != CL_SUCCESS) {
+ throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
+ }
+ };
+
+  // Runs the routines and collects the timings
+ auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
+ auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
+
+ // Prints the performance of both libraries
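+  // GEMM requires one multiply and one add per inner-product term: 2*m*n*k flops in total. The
+  // byte count covers a single pass over the A (m*k), B (k*n) and C (m*n) matrices.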
+ const auto flops = 2 * args.m * args.n * args.k;
+ const auto bytes = (args.m*args.k + args.k*args.n + args.m*args.n) * sizeof(T);
+ const auto output_ints = std::vector<size_t>{args.m, args.n, args.k,
+ static_cast<size_t>(args.layout),
+ static_cast<size_t>(args.a_transpose),
+ static_cast<size_t>(args.b_transpose),
+ args.a_ld, args.b_ld, args.c_ld,
+ args.a_offset, args.b_offset, args.c_offset};
+ const auto output_strings = std::vector<std::string>{ToString(args.alpha),
+ ToString(args.beta)};
+ PrintTableRow(output_ints, output_strings, args.no_abbrv,
+ ms_clblast, ms_clblas, flops, bytes);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void ClientXgemm(int argc, char *argv[]) {
+ const auto o = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
+ kArgATransp, kArgBTransp,
+ kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+ kArgAOffset, kArgBOffset, kArgCOffset,
+ kArgAlpha, kArgBeta};
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXgemm<float>, o); break;
+ case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXgemm<double>, o); break;
+ case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::ClientXgemm(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/xsymm.cc b/test/performance/routines/xsymm.cc
new file mode 100644
index 00000000..13ad434a
--- /dev/null
+++ b/test/performance/routines/xsymm.cc
@@ -0,0 +1,115 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm command-line interface tester.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <exception>
+
+#include "wrapper_clblas.h"
+#include "performance/client.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The client, used for performance testing. It contains the function calls to CLBlast and to other
+// libraries to compare against.
+template <typename T>
+void PerformanceXsymm(const Arguments<T> &args,
+ const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
+ CommandQueue &queue) {
+
+ // Creates the CLBlast lambda
+ auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Symm(args.layout, args.side, args.triangle,
+ args.m, args.n,
+ args.alpha,
+ a_mat(), args.a_offset, args.a_ld,
+ b_mat(), args.b_offset, args.b_ld,
+ args.beta,
+ c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ if (status != StatusCode::kSuccess) {
+ throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
+ }
+ };
+
+ // Creates the clBLAS lambda (for comparison)
+ auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasSide>(args.side),
+ static_cast<clblasUplo>(args.triangle),
+ args.m, args.n,
+ args.alpha,
+ a_mat(), args.a_offset, args.a_ld,
+ b_mat(), args.b_offset, args.b_ld,
+ args.beta,
+ c_mat(), args.c_offset, args.c_ld,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ if (status != CL_SUCCESS) {
+ throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
+ }
+ };
+
+  // Runs the routines and collects the timings
+ auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
+ auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
+
+ // Prints the performance of both libraries
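+  // Note: these flop and byte counts assume the symmetric matrix A is m-by-m, i.e. the
+  // 'side == left' case.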
+ const auto flops = 2 * args.m * args.n * args.m;
+ const auto bytes = (args.m*args.m + args.m*args.n + args.m*args.n) * sizeof(T);
+ const auto output_ints = std::vector<size_t>{args.m, args.n,
+ static_cast<size_t>(args.layout),
+ static_cast<size_t>(args.triangle),
+ static_cast<size_t>(args.side),
+ args.a_ld, args.b_ld, args.c_ld,
+ args.a_offset, args.b_offset, args.c_offset};
+ const auto output_strings = std::vector<std::string>{ToString(args.alpha),
+ ToString(args.beta)};
+ PrintTableRow(output_ints, output_strings, args.no_abbrv,
+ ms_clblast, ms_clblas, flops, bytes);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void ClientXsymm(int argc, char *argv[]) {
+ const auto o = std::vector<std::string>{kArgM, kArgN, kArgLayout,
+ kArgTriangle, kArgSide,
+ kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+ kArgAOffset, kArgBOffset, kArgCOffset,
+ kArgAlpha, kArgBeta};
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsymm<float>, o); break;
+ case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsymm<double>, o); break;
+ case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::ClientXsymm(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h
new file mode 100644
index 00000000..7c71fcaa
--- /dev/null
+++ b/test/wrapper_clblas.h
@@ -0,0 +1,216 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a wrapper around the clBLAS library, such that its routines can be called
+// in a similar way as the CLBlast routines: using alpha and beta to determine the precision.
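+// For example, calling clblasXaxpy with a 'float' alpha resolves to clblasSaxpy, whereas the same
+// call with a 'double2' alpha resolves to clblasZaxpy: the scalar type selects the precision.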
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_WRAPPER_CLBLAS_H_
+#define CLBLAST_TEST_WRAPPER_CLBLAS_H_
+
+#include <clBLAS.h>
+
+#include "internal/utilities.h"
+
+namespace clblast {
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+
+// This calls {clblasSaxpy, clblasDaxpy, clblasCaxpy, clblasZaxpy} with the arguments forwarded.
+clblasStatus clblasXaxpy(
+ size_t n, float alpha,
+ const cl_mem x_vec, size_t x_offset, size_t x_inc,
+ const cl_mem y_vec, size_t y_offset, size_t y_inc,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ return clblasSaxpy(n, alpha,
+ x_vec, x_offset, static_cast<int>(x_inc),
+ y_vec, y_offset, static_cast<int>(y_inc),
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXaxpy(
+ size_t n, double alpha,
+ const cl_mem x_vec, size_t x_offset, size_t x_inc,
+ const cl_mem y_vec, size_t y_offset, size_t y_inc,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ return clblasDaxpy(n, alpha,
+ x_vec, x_offset, static_cast<int>(x_inc),
+ y_vec, y_offset, static_cast<int>(y_inc),
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXaxpy(
+ size_t n, float2 alpha,
+ const cl_mem x_vec, size_t x_offset, size_t x_inc,
+ const cl_mem y_vec, size_t y_offset, size_t y_inc,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
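+  // clBLAS expects complex scalars as cl_float2/cl_double2 values, so alpha is unpacked into its
+  // real and imaginary parts first.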
+ auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
+ return clblasCaxpy(n, cl_alpha,
+ x_vec, x_offset, static_cast<int>(x_inc),
+ y_vec, y_offset, static_cast<int>(y_inc),
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXaxpy(
+ size_t n, double2 alpha,
+ const cl_mem x_vec, size_t x_offset, size_t x_inc,
+ const cl_mem y_vec, size_t y_offset, size_t y_inc,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
+ return clblasZaxpy(n, cl_alpha,
+ x_vec, x_offset, static_cast<int>(x_inc),
+ y_vec, y_offset, static_cast<int>(y_inc),
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+
+// This calls {clblasSgemm, clblasDgemm, clblasCgemm, clblasZgemm} with the arguments forwarded.
+clblasStatus clblasXgemm(
+ clblasOrder layout, clblasTranspose tran_a, clblasTranspose tran_b,
+ size_t m, size_t n, size_t k, float alpha,
+ const cl_mem a_mat, size_t a_offset, size_t a_ld,
+ const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta,
+ cl_mem c_mat, size_t c_offset, size_t c_ld,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ return clblasSgemm(layout, tran_a, tran_b,
+ m, n, k, alpha,
+ a_mat, a_offset, a_ld,
+ b_mat, b_offset, b_ld, beta,
+ c_mat, c_offset, c_ld,
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXgemm(
+ clblasOrder layout, clblasTranspose tran_a, clblasTranspose tran_b,
+ size_t m, size_t n, size_t k, double alpha,
+ const cl_mem a_mat, size_t a_offset, size_t a_ld,
+ const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta,
+ cl_mem c_mat, size_t c_offset, size_t c_ld,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ return clblasDgemm(layout, tran_a, tran_b,
+ m, n, k, alpha,
+ a_mat, a_offset, a_ld,
+ b_mat, b_offset, b_ld, beta,
+ c_mat, c_offset, c_ld,
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXgemm(
+ clblasOrder layout, clblasTranspose tran_a, clblasTranspose tran_b,
+ size_t m, size_t n, size_t k, float2 alpha,
+ const cl_mem a_mat, size_t a_offset, size_t a_ld,
+ const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta,
+ cl_mem c_mat, size_t c_offset, size_t c_ld,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
+ auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
+ return clblasCgemm(layout, tran_a, tran_b,
+ m, n, k, cl_alpha,
+ a_mat, a_offset, a_ld,
+ b_mat, b_offset, b_ld, cl_beta,
+ c_mat, c_offset, c_ld,
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXgemm(
+ clblasOrder layout, clblasTranspose tran_a, clblasTranspose tran_b,
+ size_t m, size_t n, size_t k, double2 alpha,
+ const cl_mem a_mat, size_t a_offset, size_t a_ld,
+ const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta,
+ cl_mem c_mat, size_t c_offset, size_t c_ld,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
+ auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
+ return clblasZgemm(layout, tran_a, tran_b,
+ m, n, k, cl_alpha,
+ a_mat, a_offset, a_ld,
+ b_mat, b_offset, b_ld, cl_beta,
+ c_mat, c_offset, c_ld,
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+
+// This calls {clblasSsymm, clblasDsymm, clblasCsymm, clblasZsymm} with the arguments forwarded.
+clblasStatus clblasXsymm(
+ clblasOrder layout, clblasSide side, clblasUplo triangle,
+ size_t m, size_t n, float alpha,
+ const cl_mem a_mat, size_t a_offset, size_t a_ld,
+ const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta,
+ cl_mem c_mat, size_t c_offset, size_t c_ld,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ return clblasSsymm(layout, side, triangle,
+ m, n, alpha,
+ a_mat, a_offset, a_ld,
+ b_mat, b_offset, b_ld, beta,
+ c_mat, c_offset, c_ld,
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsymm(
+ clblasOrder layout, clblasSide side, clblasUplo triangle,
+ size_t m, size_t n, double alpha,
+ const cl_mem a_mat, size_t a_offset, size_t a_ld,
+ const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta,
+ cl_mem c_mat, size_t c_offset, size_t c_ld,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ return clblasDsymm(layout, side, triangle,
+ m, n, alpha,
+ a_mat, a_offset, a_ld,
+ b_mat, b_offset, b_ld, beta,
+ c_mat, c_offset, c_ld,
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsymm(
+ clblasOrder layout, clblasSide side, clblasUplo triangle,
+ size_t m, size_t n, float2 alpha,
+ const cl_mem a_mat, size_t a_offset, size_t a_ld,
+ const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta,
+ cl_mem c_mat, size_t c_offset, size_t c_ld,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
+ auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
+ return clblasCsymm(layout, side, triangle,
+ m, n, cl_alpha,
+ a_mat, a_offset, a_ld,
+ b_mat, b_offset, b_ld, cl_beta,
+ c_mat, c_offset, c_ld,
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+clblasStatus clblasXsymm(
+ clblasOrder layout, clblasSide side, clblasUplo triangle,
+ size_t m, size_t n, double2 alpha,
+ const cl_mem a_mat, size_t a_offset, size_t a_ld,
+ const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta,
+ cl_mem c_mat, size_t c_offset, size_t c_ld,
+ cl_uint num_queues, cl_command_queue *queues,
+ cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+ auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
+ auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
+ return clblasZsymm(layout, side, triangle,
+ m, n, cl_alpha,
+ a_mat, a_offset, a_ld,
+ b_mat, b_offset, b_ld, cl_beta,
+ c_mat, c_offset, c_ld,
+ num_queues, queues, num_wait_events, wait_events, events);
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_WRAPPER_CLBLAS_H_
+#endif