summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCNugteren <web@cedricnugteren.nl>2015-05-30 12:30:43 +0200
committerCNugteren <web@cedricnugteren.nl>2015-05-30 12:30:43 +0200
commitbc5a341dfe591946e925db315fc7d8c0c25c2938 (patch)
treeb216ab5eee4863e3807d92b5ddd19fa22197ed22 /src
parentc7b054ea6747039f4405fd93da6e924f3e5c7f4b (diff)
Initial commit of preview version
Diffstat (limited to 'src')
-rw-r--r--src/clblast.cc224
-rw-r--r--src/database.cc112
-rw-r--r--src/kernels/common.opencl120
-rw-r--r--src/kernels/copy.opencl73
-rw-r--r--src/kernels/pad.opencl180
-rw-r--r--src/kernels/padtranspose.opencl150
-rw-r--r--src/kernels/transpose.opencl168
-rw-r--r--src/kernels/xaxpy.opencl128
-rw-r--r--src/kernels/xgemm.opencl570
-rw-r--r--src/routine.cc326
-rw-r--r--src/routines/xaxpy.cc115
-rw-r--r--src/routines/xgemm.cc168
-rw-r--r--src/routines/xsymm.cc132
-rw-r--r--src/tuning/copy.cc83
-rw-r--r--src/tuning/pad.cc90
-rw-r--r--src/tuning/padtranspose.cc95
-rw-r--r--src/tuning/transpose.cc88
-rw-r--r--src/tuning/tuning.cc186
-rw-r--r--src/tuning/xaxpy.cc88
-rw-r--r--src/tuning/xgemm.cc126
-rw-r--r--src/utilities.cc255
21 files changed, 3477 insertions, 0 deletions
diff --git a/src/clblast.cc b/src/clblast.cc
new file mode 100644
index 00000000..72de3b24
--- /dev/null
+++ b/src/clblast.cc
@@ -0,0 +1,224 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements all the BLAS API calls. In all cases, it does not much more than creating
+// a new object of the appropriate type, and calling the main routine on that object. It forwards
+// all status codes to the caller.
+//
+// =================================================================================================
+
+#include <string>
+
+#include "clblast.h"
+
+// BLAS level-1 includes
+#include "internal/routines/xaxpy.h"
+
+// BLAS level-3 includes
+#include "internal/routines/xgemm.h"
+#include "internal/routines/xsymm.h"
+
+namespace clblast {
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+
+// AXPY
+template <typename T>
+StatusCode Axpy(const size_t n, const T alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = CommandQueue(*queue);
+ auto event_cpp = Event(*event);
+ auto xaxpy = Xaxpy<T>(queue_cpp, event_cpp);
+
+ // Loads the kernel source-code as an include (C++11 raw string literal)
+ std::string kernel_source =
+ #include "kernels/xaxpy.opencl"
+ auto status = xaxpy.SetUp(kernel_source);
+ if (status != StatusCode::kSuccess) { return status; }
+
+ // Runs the routine
+ return xaxpy.DoAxpy(n, alpha,
+ Buffer(x_buffer), x_offset, x_inc,
+ Buffer(y_buffer), y_offset, y_inc);
+}
+template StatusCode Axpy<float>(const size_t, const float,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Axpy<double>(const size_t, const double,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Axpy<float2>(const size_t, const float2,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Axpy<double2>(const size_t, const double2,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+
+// GEMM
+template <typename T>
+StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
+ const size_t m, const size_t n, const size_t k,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = CommandQueue(*queue);
+ auto event_cpp = Event(*event);
+ auto xgemm = Xgemm<T>(queue_cpp, event_cpp);
+
+ // Loads the kernel source-code as an include (C++11 raw string literal)
+ std::string common_source1 =
+ #include "kernels/copy.opencl"
+ std::string common_source2 =
+ #include "kernels/pad.opencl"
+ std::string common_source3 =
+ #include "kernels/transpose.opencl"
+ std::string common_source4 =
+ #include "kernels/padtranspose.opencl"
+ std::string kernel_source =
+ #include "kernels/xgemm.opencl"
+ auto status = xgemm.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
+ kernel_source);
+ if (status != StatusCode::kSuccess) { return status; }
+
+ // Runs the routine
+ return xgemm.DoGemm(layout, transpose_a, transpose_b,
+ m, n, k,
+ alpha,
+ Buffer(a_buffer), a_offset, a_ld,
+ Buffer(b_buffer), b_offset, b_ld,
+ beta,
+ Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Gemm<float>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const float,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const float,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Gemm<double>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const double,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const double,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+/*
+template StatusCode Gemm<float2>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const float2,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const float2,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Gemm<double2>(const Layout, const Transpose, const Transpose,
+ const size_t, const size_t, const size_t,
+ const double2,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const double2,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+*/
+
+// =================================================================================================
+
+// SYMM
+template <typename T>
+StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
+ const size_t m, const size_t n,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = CommandQueue(*queue);
+ auto event_cpp = Event(*event);
+ auto xsymm = Xsymm<T>(queue_cpp, event_cpp);
+
+ // Loads the kernel source-code as an include (C++11 raw string literal)
+ std::string common_source1 =
+ #include "kernels/copy.opencl"
+ std::string common_source2 =
+ #include "kernels/pad.opencl"
+ std::string common_source3 =
+ #include "kernels/transpose.opencl"
+ std::string common_source4 =
+ #include "kernels/padtranspose.opencl"
+ std::string kernel_source =
+ #include "kernels/xgemm.opencl"
+ auto status = xsymm.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
+ kernel_source);
+ if (status != StatusCode::kSuccess) { return status; }
+
+ // Runs the routine
+ return xsymm.DoSymm(layout, side, triangle,
+ m, n,
+ alpha,
+ Buffer(a_buffer), a_offset, a_ld,
+ Buffer(b_buffer), b_offset, b_ld,
+ beta,
+ Buffer(c_buffer), c_offset, c_ld);
+}
+template StatusCode Symm<float>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const float,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const float,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Symm<double>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const double,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const double,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+/*
+template StatusCode Symm<float2>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const float2,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const float2,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
+ const size_t, const size_t,
+ const double2,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ const double2,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+*/
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database.cc b/src/database.cc
new file mode 100644
index 00000000..beaa122b
--- /dev/null
+++ b/src/database.cc
@@ -0,0 +1,112 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Database class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/database.h"
+#include "internal/database/xaxpy.h"
+#include "internal/database/xgemm.h"
+#include "internal/database/copy.h"
+#include "internal/database/pad.h"
+#include "internal/database/transpose.h"
+#include "internal/database/padtranspose.h"
+
+#include "internal/utilities.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Initializes the database
+const std::vector<Database::DatabaseEntry> Database::database = {
+ XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
+ XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
+ CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
+ PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
+ TraSingle, TraDouble, TraComplexSingle, TraComplexDouble,
+ PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble
+};
+
+// =================================================================================================
+
+// Constructor, computing device properties and populating the parameter-vector from the database
+Database::Database(const CommandQueue &queue, const std::vector<std::string> &kernels,
+ const Precision precision):
+ parameters_{} {
+
+ // Finds information of the current device
+ auto device = queue.GetDevice();
+ auto device_type = device.Type();
+ auto device_vendor = device.Vendor();
+ auto device_name = device.Name();
+
+ // Iterates over all kernels to include, and retrieves the parameters for each of them
+ for (auto &kernel: kernels) {
+ auto search_result = Search(kernel, device_type, device_vendor, device_name, precision);
+ parameters_.insert(search_result.begin(), search_result.end());
+ }
+}
+
+// =================================================================================================
+
+// Returns a list of OpenCL pre-processor defines in string form
+std::string Database::GetDefines() const {
+ std::string defines{};
+ for (auto &parameter: parameters_) {
+ defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n";
+ }
+ return defines;
+}
+
+// =================================================================================================
+
+// Searches the database for the right kernel and precision
+Database::Parameters Database::Search(const std::string &this_kernel,
+ const cl_device_type this_type,
+ const std::string &this_vendor,
+ const std::string &this_device,
+ const Precision this_precision) const {
+ for (auto &db: database) {
+ if (db.kernel == this_kernel && db.precision == this_precision) {
+
+ // Searches for the right vendor and device type, or selects the default if unavailable. This
+ // assumes that the default vendor / device type is last in the database.
+ for (auto &vendor: db.vendors) {
+ if (VendorEqual(vendor.name, this_vendor) &&
+ (vendor.type == this_type || vendor.type == CL_DEVICE_TYPE_ALL)) {
+
+ // Searches for the right device. If the current device is unavailable, selects the vendor
+ // default parameters. This assumes the default is last in the database.
+ for (auto &device: vendor.devices) {
+ if (device.name == this_device || device.name == kDefault) {
+
+ // Sets the parameters accordingly
+ return device.parameters;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // If we reached this point, something is wrong
+ throw std::runtime_error("Database error, could not find a suitable entry");
+}
+
+// Determines the equality between two vendor names. This is implemented because vendor names can
+// be ambiguous and might change between different SDK or driver versions.
+bool Database::VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const {
+ if (db_vendor == kDefault) { return true; }
+ if (db_vendor == cl_vendor) { return true; }
+ return false;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
new file mode 100644
index 00000000..154265e4
--- /dev/null
+++ b/src/kernels/common.opencl
@@ -0,0 +1,120 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common defines and type-defs for the CLBlast OpenCL kernels.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this file is used outside of the CLBlast library.
+#ifndef PRECISION
+ #define PRECISION 32 // Data-types: single or double precision, complex or regular
+#endif
+
+// =================================================================================================
+
+// Enable support for double-precision
+#if PRECISION == 64 || PRECISION == 6464
+ #if __OPENCL_VERSION__ <= CL_VERSION_1_1
+ #pragma OPENCL EXTENSION cl_khr_fp64: enable
+ #endif
+#endif
+
+// Single-precision
+#if PRECISION == 32
+ typedef float real;
+ typedef float2 real2;
+ typedef float4 real4;
+ typedef float8 real8;
+ typedef float16 real16;
+ #define ZERO 0.0f
+
+// Double-precision
+#elif PRECISION == 64
+ typedef double real;
+ typedef double2 real2;
+ typedef double4 real4;
+ typedef double8 real8;
+ typedef double16 real16;
+ #define ZERO 0.0
+
+// Complex single-precision
+#elif PRECISION == 3232
+ typedef struct cfloat {float x; float y;} real;
+ typedef struct cfloat2 {real x; real y;} real2;
+ typedef struct cfloat4 {real x; real y; real z; real w;} real4;
+ typedef struct cfloat8 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;} real8;
+ typedef struct cfloat16 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;
+ real s8; real s9; real sA; real sB;
+ real sC; real sD; real sE; real sF;} real16;
+ #define ZERO 0.0f
+
+// Complex Double-precision
+#elif PRECISION == 6464
+ typedef struct cdouble {double x; double y;} real;
+ typedef struct cdouble2 {real x; real y;} real2;
+ typedef struct cdouble4 {real x; real y; real z; real w;} real4;
+ typedef struct cdouble8 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;} real8;
+ typedef struct cdouble16 {real s0; real s1; real s2; real s3;
+ real s4; real s5; real s6; real s7;
+ real s8; real s9; real sA; real sB;
+ real sC; real sD; real sE; real sF;} real16;
+ #define ZERO 0.0
+#endif
+
+// =================================================================================================
+
+// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction
+#define USE_CL_MAD 0
+
+// Sets a variable to zero
+#if PRECISION == 3232 || PRECISION == 6464
+ #define SetToZero(a) a.x = ZERO; a.y = ZERO
+#else
+ #define SetToZero(a) a = ZERO
+#endif
+
+// Multiply two complex variables (used in the define below)
+#if PRECISION == 3232 || PRECISION == 6464
+ #define MulReal(a, b) a.x*b.x - a.y*b.y
+ #define MulImag(a, b) a.x*b.y + a.y*b.x
+#endif
+
+// The scalar multiply-add function
+#if PRECISION == 3232 || PRECISION == 6464
+ #define MultiplyAdd(c, a, b) c.x += MulReal(a,b); c.y += MulImag(a,b)
+#else
+ #if USE_CL_MAD == 1
+ #define MultiplyAdd(c, a, b) c = mad(a, b, c)
+ #else
+ #define MultiplyAdd(c, a, b) c += a * b
+ #endif
+#endif
+
+// The scalar AXPBY function
+#if PRECISION == 3232 || PRECISION == 6464
+ #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)
+#else
+ #define AXPBY(e, a, b, c, d) e = a*b + c*d
+#endif
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/copy.opencl b/src/kernels/copy.opencl
new file mode 100644
index 00000000..f95b476b
--- /dev/null
+++ b/src/kernels/copy.opencl
@@ -0,0 +1,73 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS routines. This file contains
+// kernels to copy matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef COPY_DIMX
+ #define COPY_DIMX 8 // Local workgroup size in the first dimension (x)
+#endif
+#ifndef COPY_DIMY
+ #define COPY_DIMY 8 // Local workgroup size in the second dimension (y)
+#endif
+#ifndef COPY_WPT
+ #define COPY_WPT 1 // Work per thread in the first dimension (x)
+#endif
+#ifndef COPY_VW
+ #define COPY_VW 1 // Vector width in the second dimension (y)
+#endif
+
+// =================================================================================================
+
+// Data-widths
+#if COPY_VW == 1
+ typedef real realC;
+#elif COPY_VW == 2
+ typedef real2 realC;
+#elif COPY_VW == 4
+ typedef real4 realC;
+#elif COPY_VW == 8
+ typedef real8 realC;
+#elif COPY_VW == 16
+ typedef real16 realC;
+#endif
+
+// =================================================================================================
+
+// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
+// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
+__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+__kernel void CopyMatrix(const int ld,
+ __global const realC* restrict src,
+ __global realC* dest) {
+ #pragma unroll
+ for (int w_one=0; w_one<COPY_WPT; ++w_one) {
+ const int id_one = get_global_id(0);
+ const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
+ const int id = id_two*(ld/COPY_VW) + id_one;
+ dest[id] = src[id];
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl
new file mode 100644
index 00000000..ccaeb9d6
--- /dev/null
+++ b/src/kernels/pad.opencl
@@ -0,0 +1,180 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS routines. This file contains
+// kernels to copy and pad matrices in various ways, including:
+// 1) copying into a larger matrix by adding padding
+// 2) copying into a smaller matrix by removing padding
+// 3) from upper/lower triangle into a full matrix
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef PAD_DIMX
+ #define PAD_DIMX 8 // Local workgroup size in the first dimension (x)
+#endif
+#ifndef PAD_DIMY
+ #define PAD_DIMY 8 // Local workgroup size in the second dimension (y)
+#endif
+#ifndef PAD_WPTX
+ #define PAD_WPTX 1 // Work per thread in the first dimension (x)
+#endif
+#ifndef PAD_WPTY
+ #define PAD_WPTY 1 // Work per thread in the second dimension (y)
+#endif
+
+// =================================================================================================
+
+// Copies a matrix from source to destination. The output is padded with zero values in case the
+// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
+// value and offset can be different.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void PadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_two && id_one < dest_one) {
+
+ // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the
+ // value to be written to zero.
+ real value;
+ SetToZero(value);
+ if (id_two < src_two && id_one < src_one) {
+ value = src[id_two*src_ld + id_one + src_offset];
+ }
+
+ // Stores the value in the destination matrix
+ dest[id_two*dest_ld + id_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
+// writes only the actual data back to the destination matrix. Again, the ld value and offset can
+// be different.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void UnPadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_two && id_one < dest_one) {
+
+ // Copies the value into the destination matrix. This is always within bounds of the source
+ // matrix, as we know that the destination matrix is smaller than the source.
+ dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmLowerToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_dim && id_one < dest_dim) {
+
+ // Loads data from the lower-symmetric matrix
+ real value;
+ SetToZero(value);
+ if (id_two < src_dim && id_one < src_dim) {
+ if (id_two <= id_one) { value = src[id_two*src_ld + id_one + src_offset]; }
+ else { value = src[id_one*src_ld + id_two + src_offset]; }
+ }
+
+ // Stores the value in the destination matrix
+ dest[id_two*dest_ld + id_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// Same as above, but now the matrix's data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmUpperToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_dim && id_one < dest_dim) {
+
+ // Loads data from the upper-symmetric matrix
+ real value;
+ SetToZero(value);
+ if (id_two < src_dim && id_one < src_dim) {
+ if (id_one <= id_two) { value = src[id_two*src_ld + id_one + src_offset]; }
+ else { value = src[id_one*src_ld + id_two + src_offset]; }
+ }
+
+ // Stores the value in the destination matrix
+ dest[id_two*dest_ld + id_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/padtranspose.opencl b/src/kernels/padtranspose.opencl
new file mode 100644
index 00000000..67cbf341
--- /dev/null
+++ b/src/kernels/padtranspose.opencl
@@ -0,0 +1,150 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains common kernels shared among different BLAS functions, in particular
+// kernels to transpose matrices in various ways, including:
+// 1) transposing into a larger matrix by adding padding
+// 2) transposing into a smaller matrix by removing padding
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef PADTRA_TILE
+ #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef PADTRA_WPT
+ #define PADTRA_WPT 1 // Amount of work per thread
+#endif
+#ifndef PADTRA_PAD
+ #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
+#endif
+
+// =================================================================================================
+
+// Same as PadCopyMatrix, but now also does the transpose
+__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+__kernel void PadTransposeMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Local memory to store a tile of the matrix (for coalescing)
+ __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the source matrix. Note that the local and global dimensions
+ // do not correspond to each other!
+ const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
+ const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
+
+ // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
+ // Otherwise, set the local memory value to zero: this implements the padding of the result.
+ real value;
+ SetToZero(value);
+ if (id_src_two < src_two && id_src_one < src_one) {
+ value = src[id_src_two*src_ld + id_src_one + src_offset];
+ }
+ tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value; // transposed tile store
+ }
+ }
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the destination matrix
+ const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
+ const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
+
+ // Stores the transposed value in the destination matrix
+ if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+ real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+ dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// Same as UnPadCopyMatrix, but now also does the transpose
+__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+__kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
+
+ // Local memory to store a tile of the matrix (for coalescing)
+ __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the source matrix. Note that the local and global dimensions
+ // do not correspond to each other!
+ const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
+ const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
+
+ // Loads data into local memory if in bounds (out-of-bounds tile entries stay uninitialized; presumably never stored since dest is smaller -- TODO confirm with callers).
+ if ((id_src_one < src_one) && (id_src_two < src_two)) {
+ real value = src[id_src_two*src_ld + id_src_one + src_offset];
+ tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value; // transposed tile store
+ }
+ }
+ }
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+ #pragma unroll
+ for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the destination matrix
+ const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
+ const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
+
+ // Stores the transposed value in the destination matrix
+ if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+ real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+ dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+ }
+ }
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/transpose.opencl b/src/kernels/transpose.opencl
new file mode 100644
index 00000000..79ab1688
--- /dev/null
+++ b/src/kernels/transpose.opencl
@@ -0,0 +1,168 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains common kernels shared among different BLAS functions, in particular
+// kernels to transpose matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef TRA_DIM
+ #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef TRA_WPT
+ #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other
+#endif
+#ifndef TRA_PAD
+ #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
+#endif
+
+// =================================================================================================
+
+// Data-widths
+#if TRA_WPT == 1
+ typedef real realT;
+#elif TRA_WPT == 2
+ typedef real2 realT;
+#elif TRA_WPT == 4
+ typedef real4 realT;
+#elif TRA_WPT == 8
+ typedef real8 realT;
+#elif TRA_WPT == 16
+ typedef real16 realT;
+#endif
+
+// =================================================================================================
+
+// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
+// offset. A more general version is available in 'padtranspose.opencl'.
+__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+__kernel void TransposeMatrix(const int ld,
+ __global const realT* restrict src,
+ __global realT* dest) {
+
+ // Local memory to store a tile of the matrix (for coalescing)
+ __local real tile[TRA_WPT*TRA_DIM][TRA_WPT*TRA_DIM + TRA_PAD];
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_one=0; w_one<TRA_WPT; ++w_one) {
+
+ // Computes the identifiers for the source matrix. Note that the local and global dimensions
+ // do not correspond to each other!
+ const int id_one = get_group_id(1) * TRA_DIM + get_local_id(0);
+ const int id_two = (get_group_id(0) * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
+
+ // Loads a vector of TRA_WPT elements and scatters its components into the local tile
+ realT value = src[id_two*(ld/TRA_WPT) + id_one]; // requires 'ld' to be a multiple of TRA_WPT
+ #if TRA_WPT == 1
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value;
+ #elif TRA_WPT == 2
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
+ #elif TRA_WPT == 4
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
+ tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.z;
+ tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.w;
+ #elif TRA_WPT == 8
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
+ tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
+ tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
+ tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
+ tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
+ tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
+ tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
+ #elif TRA_WPT == 16
+ tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
+ tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
+ tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
+ tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
+ tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
+ tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
+ tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
+ tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
+ tile[get_local_id(1)*TRA_WPT + 8][get_local_id(0)*TRA_WPT + w_one] = value.s8;
+ tile[get_local_id(1)*TRA_WPT + 9][get_local_id(0)*TRA_WPT + w_one] = value.s9;
+ tile[get_local_id(1)*TRA_WPT + 10][get_local_id(0)*TRA_WPT + w_one] = value.sA;
+ tile[get_local_id(1)*TRA_WPT + 11][get_local_id(0)*TRA_WPT + w_one] = value.sB;
+ tile[get_local_id(1)*TRA_WPT + 12][get_local_id(0)*TRA_WPT + w_one] = value.sC;
+ tile[get_local_id(1)*TRA_WPT + 13][get_local_id(0)*TRA_WPT + w_one] = value.sD;
+ tile[get_local_id(1)*TRA_WPT + 14][get_local_id(0)*TRA_WPT + w_one] = value.sE;
+ tile[get_local_id(1)*TRA_WPT + 15][get_local_id(0)*TRA_WPT + w_one] = value.sF;
+ #endif
+ }
+
+ // Synchronizes all threads in a workgroup
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Loop over the work per thread
+ #pragma unroll
+ for (int w_two=0; w_two<TRA_WPT; ++w_two) {
+
+ // Computes the identifiers for the destination matrix
+ const int id_one = get_global_id(0);
+ const int id_two = get_global_id(1)*TRA_WPT + w_two;
+
+ // Gathers TRA_WPT tile elements into one vector and stores it transposed in the destination
+ realT value;
+ #if TRA_WPT == 1
+ value = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ #elif TRA_WPT == 2
+ value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ #elif TRA_WPT == 4
+ value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ value.z = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
+ value.w = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
+ #elif TRA_WPT == 8
+ value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
+ value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
+ value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
+ value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
+ value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
+ value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
+ #elif TRA_WPT == 16
+ value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
+ value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
+ value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
+ value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
+ value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
+ value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
+ value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
+ value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
+ value.s8 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 8];
+ value.s9 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 9];
+ value.sA = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 10];
+ value.sB = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 11];
+ value.sC = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 12];
+ value.sD = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 13];
+ value.sE = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 14];
+ value.sF = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 15];
+ #endif
+ dest[id_two*(ld/TRA_WPT) + id_one] = value;
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/xaxpy.opencl b/src/kernels/xaxpy.opencl
new file mode 100644
index 00000000..40c6c3bd
--- /dev/null
+++ b/src/kernels/xaxpy.opencl
@@ -0,0 +1,128 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
+// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
+// support vector data-types.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef WGS
+ #define WGS 64 // The local work-group size
+#endif
+#ifndef WPT
+ #define WPT 1 // The amount of work-per-thread
+#endif
+#ifndef VW
+ #define VW 1 // Vector width of vectors X and Y
+#endif
+
+// =================================================================================================
+
+// Data-widths
+#if VW == 1
+ typedef real realV;
+#elif VW == 2
+ typedef real2 realV;
+#elif VW == 4
+ typedef real4 realV;
+#elif VW == 8
+ typedef real8 realV;
+#elif VW == 16
+ typedef real16 realV;
+#endif
+
+// =================================================================================================
+
+// The vectorized multiply-add function
+inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { // per-component MultiplyAdd of scalar 'aval' with 'bvec' into 'cvec'
+ #if VW == 1
+ MultiplyAdd(cvec, aval, bvec);
+ #elif VW == 2
+ MultiplyAdd(cvec.x, aval, bvec.x);
+ MultiplyAdd(cvec.y, aval, bvec.y);
+ #elif VW == 4
+ MultiplyAdd(cvec.x, aval, bvec.x);
+ MultiplyAdd(cvec.y, aval, bvec.y);
+ MultiplyAdd(cvec.z, aval, bvec.z);
+ MultiplyAdd(cvec.w, aval, bvec.w);
+ #elif VW == 8
+ MultiplyAdd(cvec.s0, aval, bvec.s0);
+ MultiplyAdd(cvec.s1, aval, bvec.s1);
+ MultiplyAdd(cvec.s2, aval, bvec.s2);
+ MultiplyAdd(cvec.s3, aval, bvec.s3);
+ MultiplyAdd(cvec.s4, aval, bvec.s4);
+ MultiplyAdd(cvec.s5, aval, bvec.s5);
+ MultiplyAdd(cvec.s6, aval, bvec.s6);
+ MultiplyAdd(cvec.s7, aval, bvec.s7);
+ #elif VW == 16
+ MultiplyAdd(cvec.s0, aval, bvec.s0);
+ MultiplyAdd(cvec.s1, aval, bvec.s1);
+ MultiplyAdd(cvec.s2, aval, bvec.s2);
+ MultiplyAdd(cvec.s3, aval, bvec.s3);
+ MultiplyAdd(cvec.s4, aval, bvec.s4);
+ MultiplyAdd(cvec.s5, aval, bvec.s5);
+ MultiplyAdd(cvec.s6, aval, bvec.s6);
+ MultiplyAdd(cvec.s7, aval, bvec.s7);
+ MultiplyAdd(cvec.s8, aval, bvec.s8);
+ MultiplyAdd(cvec.s9, aval, bvec.s9);
+ MultiplyAdd(cvec.sA, aval, bvec.sA);
+ MultiplyAdd(cvec.sB, aval, bvec.sB);
+ MultiplyAdd(cvec.sC, aval, bvec.sC);
+ MultiplyAdd(cvec.sD, aval, bvec.sD);
+ MultiplyAdd(cvec.sE, aval, bvec.sE);
+ MultiplyAdd(cvec.sF, aval, bvec.sF);
+ #endif
+ return cvec; // modified by value: the caller assigns the result back
+}
+
+// =================================================================================================
+
+// Full version of the kernel with offsets and strided accesses
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void Xaxpy(const int n, const real alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
+
+ // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+ #pragma unroll
+ for (int id = get_global_id(0); id<n; id += get_global_size(0)) { // note: trip count is runtime-dependent, so the unroll is a hint at best
+ MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
+ }
+}
+
+// =================================================================================================
+
+// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
+// divisible by 'VW', 'WGS' and 'WPT'.
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void XaxpyFast(const int n, const real alpha, // note: 'n' is not referenced in the body; the launch dimensions encode the work
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
+ #pragma unroll
+ for (int w=0; w<WPT; ++w) {
+ const int id = w*get_global_size(0) + get_global_id(0); // consecutive threads access consecutive vectors
+ ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl
new file mode 100644
index 00000000..facaf5dc
--- /dev/null
+++ b/src/kernels/xgemm.opencl
@@ -0,0 +1,570 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
+// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
+// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
+// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
+//
+// Matrices are accessed as follows:
+// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
+// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
+// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
+//
+// Or as an image (assuming column-major)
+// K
+// o-------o
+// | |
+// N | [B^T] |
+// | |
+// o-------o
+// K N
+// o-------o o-----o
+// M | [A] | M | [C] |
+// | | | |
+// o-------o o-----o
+//
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef MWG
+ #define MWG 8 // Tile-size in dimension M (e.g. 64, 128)
+#endif
+#ifndef NWG
+ #define NWG 8 // Tile-size in dimension N (e.g. 64, 128)
+#endif
+#ifndef KWG
+ #define KWG 8 // Tile-size in dimension K (e.g. 8, 16)
+#endif
+#ifndef MDIMC
+ #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef NDIMC
+ #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef MDIMA
+ #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
+#endif
+#ifndef NDIMB
+ #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
+#endif
+#ifndef KWI
+ #define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG)
+#endif
+#ifndef VWM
+ #define VWM 1 // Vector width of matrices A and C
+#endif
+#ifndef VWN
+ #define VWN 1 // Vector width of matrix B
+#endif
+#ifndef STRM
+ #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0)
+#endif
+#ifndef STRN
+ #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0)
+#endif
+#ifndef SA
+ #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0)
+#endif
+#ifndef SB
+ #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0)
+#endif
+
+// Helper parameters based on the above tuning parameters
+#define MWI (MWG/MDIMC) // Work per work-item (M-dimension)
+#define NWI (NWG/NDIMC) // Work per work-item (N-dimension)
+#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
+#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
+#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension)
+#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension)
+#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension)
+#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension)
+
+// Settings
+#define USE_VECTOR_MAD 0 // Unroll the vector MAD manually per element (0) or use a single vector-wide MAD (1)
+
+// =================================================================================================
+
+// Data-widths in dimension M
+#if VWM == 1
+ typedef real realM;
+#elif VWM == 2
+ typedef real2 realM;
+#elif VWM == 4
+ typedef real4 realM;
+#elif VWM == 8
+ typedef real8 realM;
+#elif VWM == 16
+ typedef real16 realM;
+#endif
+
+// Data-widths in dimension N
+#if VWN == 1
+ typedef real realN;
+#elif VWN == 2
+ typedef real2 realN;
+#elif VWN == 4
+ typedef real4 realN;
+#elif VWN == 8
+ typedef real8 realN;
+#elif VWN == 16
+ typedef real16 realN;
+#endif
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix.
+#if SA == 1
+inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
+ const int kSizeM, const int tid, const int kwg) {
+ const int la0 = tid % MDIMA; // thread index within the re-shaped tile (M-dimension)
+ const int la1 = tid / MDIMA; // thread index within the re-shaped tile (K-dimension)
+ #pragma unroll
+ for (int mia=0; mia<MWA/VWM; ++mia) {
+ #pragma unroll
+ for (int kia=0; kia<KWA; ++kia) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRM == 0
+ int mg = mia + la0*(MWA/VWM);
+ #elif STRM == 1
+ int mg = la0 + mia*MDIMA;
+ #endif
+
+ // Computes the indices for the global memory
+ int kg = kia + la1*KWA;
+ int idm = mg + get_group_id(0)*(MWG/VWM);
+ int idk = kg + kwg;
+
+ // Loads the data from global memory (not transposed) into the local memory
+ alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
+ }
+ }
+}
+#endif
+
+// Same as above, but now for the B input matrix
+#if SB == 1
+inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
+ const int kSizeN, const int tid, const int kwg) {
+ const int lb0 = tid % NDIMB; // thread index within the re-shaped tile (N-dimension)
+ const int lb1 = tid / NDIMB; // thread index within the re-shaped tile (K-dimension)
+ #pragma unroll
+ for (int kib=0; kib<KWB; ++kib) {
+ #pragma unroll
+ for (int nib=0; nib<NWB/VWN; ++nib) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRN == 0
+ int ng = nib + lb0*(NWB/VWN);
+ #elif STRN == 1
+ int ng = lb0 + nib*NDIMB;
+ #endif
+
+ // Computes the indices for the global memory
+ int kg = kib + lb1*KWB;
+ int idn = ng + get_group_id(1)*(NWG/VWN);
+ int idk = kg + kwg;
+
+ // Loads the data from global memory (transposed) into the local memory
+ blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
+ }
+ }
+}
+#endif
+
+// =================================================================================================
+
+// Caches global off-chip memory directly into per-thread private memory (registers). This function
+// is specific for caching the A input matrix.
+#if SA == 0
+inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
+ const int kSizeM, const int idk, const int kwg) { // note: 'kwg' is not referenced in the body
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRM == 0
+ int mg = mi + get_local_id(0)*(MWI/VWM);
+ #elif STRM == 1
+ int mg = get_local_id(0) + mi*MDIMC;
+ #endif
+
+ // Computes the indices for the global memory
+ int idm = mg + get_group_id(0)*(MWG/VWM);
+
+ // Loads the data from global memory (not transposed) and stores into registers
+ apm[mi] = agm[idk*(kSizeM/VWM) + idm];
+ }
+}
+#endif
+
+// Same as above, but now for the B input matrix
+#if SB == 0
+inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
+ const int kSizeN, const int idk) {
+ #pragma unroll
+ for (int ni=0; ni<NWI/VWN; ++ni) {
+
+ // Computes the indices based on strided/non-strided access
+ #if STRN == 0
+ int ng = ni + get_local_id(1)*(NWI/VWN);
+ #elif STRN == 1
+ int ng = get_local_id(1) + ni*NDIMC;
+ #endif
+
+ // Computes the indices for the global memory
+ int idn = ng + get_group_id(1)*(NWG/VWN);
+
+ // Loads the data from global memory (transposed) and stores into registers
+ bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
+ }
+}
+#endif
+
+// =================================================================================================
+
+// Caches on-chip local memory into per-thread private memory (registers). This function is specific
+// for caching the A input matrix.
+#if SA == 1
+inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) { // copies one k-slice of the local A tile into registers
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+ #if STRM == 0
+ int mg = mi + get_local_id(0)*(MWI/VWM);
+ #elif STRM == 1
+ int mg = get_local_id(0) + mi*MDIMC;
+ #endif
+ apm[mi] = alm[kg*(MWG/VWM) + mg];
+ }
+}
+#endif
+
+// Same as above, but now for the B input matrix
+#if SB == 1
+inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) { // copies one k-slice of the local B tile into registers
+ #pragma unroll
+ for (int ni=0; ni<NWI/VWN; ++ni) {
+ #if STRN == 0
+ int ng = ni + get_local_id(1)*(NWI/VWN);
+ #elif STRN == 1
+ int ng = get_local_id(1) + ni*NDIMC;
+ #endif
+ bpm[ni] = blm[kg*(NWG/VWN) + ng];
+ }
+}
+#endif
+
+// =================================================================================================
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
+ const real alpha, const real beta) {
+ #pragma unroll
+ for (int ni=0; ni<NWI; ++ni) {
+ #pragma unroll
+ for (int mi=0; mi<MWI/VWM; ++mi) {
+ #if STRM == 0
+ int mg = mi + get_local_id(0)*(MWI/VWM);
+ #elif STRM == 1
+ int mg = get_local_id(0) + mi*MDIMC;
+ #endif
+ #if STRN == 0
+ int ng = ni + get_local_id(1)*NWI;
+ #elif STRN == 1
+ int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
+ #endif
+ int idm = mg + get_group_id(0)*(MWG/VWM);
+ int idn = ng + get_group_id(1)*NWG;
+ int index = idn*(kSizeM/VWM) + idm; // column-major C, vectorized in the M-dimension
+ #if VWM == 1
+ AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cgm[index]);
+ #elif VWM == 2
+ AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cgm[index].x);
+ AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cgm[index].y);
+ #elif VWM == 4
+ AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cgm[index].x);
+ AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cgm[index].y);
+ AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cgm[index].z);
+ AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cgm[index].w);
+ #elif VWM == 8
+ AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cgm[index].s0);
+ AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cgm[index].s1);
+ AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cgm[index].s2);
+ AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cgm[index].s3);
+ AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cgm[index].s4);
+ AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cgm[index].s5);
+ AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cgm[index].s6);
+ AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cgm[index].s7);
+ #elif VWM == 16
+ AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cgm[index].s0);
+ AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cgm[index].s1);
+ AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cgm[index].s2);
+ AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cgm[index].s3);
+ AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cgm[index].s4);
+ AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cgm[index].s5);
+ AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cgm[index].s6);
+ AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cgm[index].s7);
+ AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cgm[index].s8);
+ AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cgm[index].s9);
+ AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cgm[index].sA);
+ AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cgm[index].sB);
+ AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cgm[index].sC);
+ AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cgm[index].sD);
+ AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cgm[index].sE);
+ AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cgm[index].sF);
+ #endif
+ }
+ }
+}
+
+// =================================================================================================
+
+// The vectorised multiply-add function: accumulates cvec += avec * bval, where the scalar
+// bval is broadcast over all VWM components of the vector.
+inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
+  #if USE_VECTOR_MAD == 1
+    // Whole-vector form: relies on the OpenCL compiler to map this onto mad/fma hardware.
+    cvec += avec * bval;
+  #else
+    // Component-wise fall-back through the MultiplyAdd macro (presumably needed when 'real'
+    // is a complex type and vector '*' is not the desired product -- confirm in common.opencl).
+    #if VWM == 1
+      MultiplyAdd(cvec,    avec,    bval);
+    #elif VWM == 2
+      MultiplyAdd(cvec.x , avec.x, bval);
+      MultiplyAdd(cvec.y , avec.y, bval);
+    #elif VWM == 4
+      MultiplyAdd(cvec.x , avec.x, bval);
+      MultiplyAdd(cvec.y , avec.y, bval);
+      MultiplyAdd(cvec.z , avec.z, bval);
+      MultiplyAdd(cvec.w , avec.w, bval);
+    #elif VWM == 8
+      MultiplyAdd(cvec.s0, avec.s0, bval);
+      MultiplyAdd(cvec.s1, avec.s1, bval);
+      MultiplyAdd(cvec.s2, avec.s2, bval);
+      MultiplyAdd(cvec.s3, avec.s3, bval);
+      MultiplyAdd(cvec.s4, avec.s4, bval);
+      MultiplyAdd(cvec.s5, avec.s5, bval);
+      MultiplyAdd(cvec.s6, avec.s6, bval);
+      MultiplyAdd(cvec.s7, avec.s7, bval);
+    #elif VWM == 16
+      MultiplyAdd(cvec.s0, avec.s0, bval);
+      MultiplyAdd(cvec.s1, avec.s1, bval);
+      MultiplyAdd(cvec.s2, avec.s2, bval);
+      MultiplyAdd(cvec.s3, avec.s3, bval);
+      MultiplyAdd(cvec.s4, avec.s4, bval);
+      MultiplyAdd(cvec.s5, avec.s5, bval);
+      MultiplyAdd(cvec.s6, avec.s6, bval);
+      MultiplyAdd(cvec.s7, avec.s7, bval);
+      MultiplyAdd(cvec.s8, avec.s8, bval);
+      MultiplyAdd(cvec.s9, avec.s9, bval);
+      MultiplyAdd(cvec.sA, avec.sA, bval);
+      MultiplyAdd(cvec.sB, avec.sB, bval);
+      MultiplyAdd(cvec.sC, avec.sC, bval);
+      MultiplyAdd(cvec.sD, avec.sD, bval);
+      MultiplyAdd(cvec.sE, avec.sE, bval);
+      MultiplyAdd(cvec.sF, avec.sF, bval);
+    #endif
+  #endif
+  // The updated accumulator is returned by value.
+  return cvec;
+}
+
+// Performs the actual computation: Cpm += Apm * Bpm. Each of the VWN components of bpm[ni]
+// feeds a distinct accumulator row 'ni*VWN + j', so cpm has NWI rows in total.
+inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) {
+  #pragma unroll
+  for (int ni=0; ni<NWI/VWN; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWI/VWM; ++mi) {
+      #if VWN == 1
+        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni]);
+      #elif VWN == 2
+        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
+        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
+      #elif VWN == 4
+        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
+        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
+        cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].z);
+        cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].w);
+      #elif VWN == 8
+        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].s0);
+        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].s1);
+        cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].s2);
+        cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].s3);
+        cpm[ni*VWN + 4][mi] = MultiplyAddVector(cpm[ni*VWN + 4][mi], apm[mi], bpm[ni].s4);
+        cpm[ni*VWN + 5][mi] = MultiplyAddVector(cpm[ni*VWN + 5][mi], apm[mi], bpm[ni].s5);
+        cpm[ni*VWN + 6][mi] = MultiplyAddVector(cpm[ni*VWN + 6][mi], apm[mi], bpm[ni].s6);
+        cpm[ni*VWN + 7][mi] = MultiplyAddVector(cpm[ni*VWN + 7][mi], apm[mi], bpm[ni].s7);
+      #elif VWN == 16
+        cpm[ni*VWN + 0 ][mi] = MultiplyAddVector(cpm[ni*VWN + 0 ][mi], apm[mi], bpm[ni].s0);
+        cpm[ni*VWN + 1 ][mi] = MultiplyAddVector(cpm[ni*VWN + 1 ][mi], apm[mi], bpm[ni].s1);
+        cpm[ni*VWN + 2 ][mi] = MultiplyAddVector(cpm[ni*VWN + 2 ][mi], apm[mi], bpm[ni].s2);
+        cpm[ni*VWN + 3 ][mi] = MultiplyAddVector(cpm[ni*VWN + 3 ][mi], apm[mi], bpm[ni].s3);
+        cpm[ni*VWN + 4 ][mi] = MultiplyAddVector(cpm[ni*VWN + 4 ][mi], apm[mi], bpm[ni].s4);
+        cpm[ni*VWN + 5 ][mi] = MultiplyAddVector(cpm[ni*VWN + 5 ][mi], apm[mi], bpm[ni].s5);
+        cpm[ni*VWN + 6 ][mi] = MultiplyAddVector(cpm[ni*VWN + 6 ][mi], apm[mi], bpm[ni].s6);
+        cpm[ni*VWN + 7 ][mi] = MultiplyAddVector(cpm[ni*VWN + 7 ][mi], apm[mi], bpm[ni].s7);
+        cpm[ni*VWN + 8 ][mi] = MultiplyAddVector(cpm[ni*VWN + 8 ][mi], apm[mi], bpm[ni].s8);
+        cpm[ni*VWN + 9 ][mi] = MultiplyAddVector(cpm[ni*VWN + 9 ][mi], apm[mi], bpm[ni].s9);
+        cpm[ni*VWN + 10][mi] = MultiplyAddVector(cpm[ni*VWN + 10][mi], apm[mi], bpm[ni].sA);
+        cpm[ni*VWN + 11][mi] = MultiplyAddVector(cpm[ni*VWN + 11][mi], apm[mi], bpm[ni].sB);
+        cpm[ni*VWN + 12][mi] = MultiplyAddVector(cpm[ni*VWN + 12][mi], apm[mi], bpm[ni].sC);
+        cpm[ni*VWN + 13][mi] = MultiplyAddVector(cpm[ni*VWN + 13][mi], apm[mi], bpm[ni].sD);
+        cpm[ni*VWN + 14][mi] = MultiplyAddVector(cpm[ni*VWN + 14][mi], apm[mi], bpm[ni].sE);
+        cpm[ni*VWN + 15][mi] = MultiplyAddVector(cpm[ni*VWN + 15][mi], apm[mi], bpm[ni].sF);
+      #endif
+    }
+  }
+}
+
+// =================================================================================================
+
+// Main entry of the kernel. This function contains the basic skeleton, the functionality is
+// provided by the inlined functions above. The kernel indexes without bounds checks, so it
+// assumes kSizeM/kSizeN/kSizeK are already padded to multiples of MWG/NWG/KWG (the host side
+// passes ceiled sizes -- see the Pad kernels and the Xgemm routine).
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+                    const real alpha, const real beta,
+                    const __global realM* restrict agm,
+                    const __global realN* restrict bgm,
+                    __global realM* cgm) {
+
+  // Combined thread identifier
+  // NOTE(review): 'volatile' on tid looks like a workaround for an overzealous compiler
+  // optimisation on some platforms -- confirm before removing.
+  #if SA == 1 || SB == 1
+    volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
+  #endif
+
+  // Allocates workgroup-private memory (local memory)
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Allocates workitem-private memory (registers); cpm holds the MWI*NWI output tile of
+  // this work-item as NWI rows of MWI/VWM vectors.
+  realM apm[MWI/VWM];
+  realN bpm[NWI/VWN];
+  realM cpm[NWI][MWI/VWM];
+
+  // Initializes the accumulation registers
+  #pragma unroll
+  for (int mi=0; mi<MWI/VWM; ++mi) {
+    #pragma unroll
+    for (int ni=0; ni<NWI; ++ni) {
+      #if VWM == 1
+        SetToZero(cpm[ni][mi]);
+      #elif VWM == 2
+        SetToZero(cpm[ni][mi].x);
+        SetToZero(cpm[ni][mi].y);
+      #elif VWM == 4
+        SetToZero(cpm[ni][mi].x);
+        SetToZero(cpm[ni][mi].y);
+        SetToZero(cpm[ni][mi].z);
+        SetToZero(cpm[ni][mi].w);
+      #elif VWM == 8
+        SetToZero(cpm[ni][mi].s0);
+        SetToZero(cpm[ni][mi].s1);
+        SetToZero(cpm[ni][mi].s2);
+        SetToZero(cpm[ni][mi].s3);
+        SetToZero(cpm[ni][mi].s4);
+        SetToZero(cpm[ni][mi].s5);
+        SetToZero(cpm[ni][mi].s6);
+        SetToZero(cpm[ni][mi].s7);
+      #elif VWM == 16
+        SetToZero(cpm[ni][mi].s0);
+        SetToZero(cpm[ni][mi].s1);
+        SetToZero(cpm[ni][mi].s2);
+        SetToZero(cpm[ni][mi].s3);
+        SetToZero(cpm[ni][mi].s4);
+        SetToZero(cpm[ni][mi].s5);
+        SetToZero(cpm[ni][mi].s6);
+        SetToZero(cpm[ni][mi].s7);
+        SetToZero(cpm[ni][mi].s8);
+        SetToZero(cpm[ni][mi].s9);
+        SetToZero(cpm[ni][mi].sA);
+        SetToZero(cpm[ni][mi].sB);
+        SetToZero(cpm[ni][mi].sC);
+        SetToZero(cpm[ni][mi].sD);
+        SetToZero(cpm[ni][mi].sE);
+        SetToZero(cpm[ni][mi].sF);
+      #endif
+    }
+  }
+
+  // Loops over all workgroup tiles
+  for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
+
+    // Loads data: off-chip --> local (matrix A)
+    #if SA == 1
+      GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
+    #endif
+    // Loads data: off-chip --> local (matrix B)
+    #if SB == 1
+      GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
+    #endif
+
+    // Synchronizes all threads in a workgroup
+    #if SA == 1 || SB == 1
+      barrier(CLK_LOCAL_MEM_FENCE);
+    #endif
+
+    // Loops over all workitem tiles, unrolled by a factor KWI
+    for (int pwi=0; pwi<KWG; pwi+=KWI) {
+      #pragma unroll
+      for (int pit=0; pit<KWI; ++pit) {
+        // 'idk' is the global k-index (no local memory); 'kg' indexes within the local tile.
+        #if SA == 0 || SB == 0
+          int idk = kwg + pwi + pit;
+        #endif
+        #if SA == 1 || SB == 1
+          int kg = pwi+pit;
+        #endif
+
+        // Loads data: local --> private (matrix A)
+        #if SA == 1
+          LocalToPrivateA(alm, apm, kg);
+        // Loads data: off-chip --> private (matrix A)
+        #else
+          GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+        #endif
+
+        // Loads data: local --> private (matrix B)
+        #if SB == 1
+          LocalToPrivateB(blm, bpm, kg);
+        // Loads data: off-chip --> private (matrix B)
+        #else
+          GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+        #endif
+
+        // Performs the accumulation (Cpm += Apm * Bpm)
+        MultiplyAccumulate(cpm, apm, bpm);
+      }
+    }
+
+    // Synchronizes all threads in a workgroup before the local tiles are overwritten in the
+    // next iteration
+    #if SA == 1 || SB == 1
+      barrier(CLK_LOCAL_MEM_FENCE);
+    #endif
+  }
+
+  // Stores an MWG * NWG tile of results and perform the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeM, alpha, beta);
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/routine.cc b/src/routine.cc
new file mode 100644
index 00000000..32face4a
--- /dev/null
+++ b/src/routine.cc
@@ -0,0 +1,326 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Routine base class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routine.h"
+
+#include "internal/utilities.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The cache of compiled OpenCL programs, shared by all Routine instances (static storage).
+std::vector<Routine::ProgramCache> Routine::program_cache_;
+
+// Constructor: not much here, because no status codes can be returned. Device properties are
+// queried once here and cached as members so later checks (e.g. in RunKernel) are cheap.
+Routine::Routine(CommandQueue &queue, Event &event,
+                 const std::vector<std::string> &routines, const Precision precision):
+    precision_(precision),
+    queue_(queue),
+    event_(event),
+    context_(queue_.GetContext()),
+    device_(queue_.GetDevice()),
+    device_name_(device_.Name()),
+    max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
+    max_work_item_sizes_(device_.MaxWorkItemSizes()),
+    max_work_group_size_(device_.MaxWorkGroupSize()),
+    db_(queue_, routines, precision_),
+    routines_(routines) {
+}
+
+// =================================================================================================
+
+// Separate set-up function to allow for status codes to be returned. Compiles the OpenCL
+// program for this routine (unless a matching binary is already cached) after verifying that
+// the device supports the requested precision. Returns kSuccess or a specific error code.
+// NOTE(review): program_cache_ is a static vector read and mutated here without any locking;
+// concurrent SetUp calls would race -- confirm the intended (single-threaded) usage.
+StatusCode Routine::SetUp(const std::string &routine_source) {
+
+  // Queries the cache to see whether or not the compiled kernel is already there. If not, it will
+  // be built and added to the cache.
+  if (!ProgramIsInCache()) {
+
+    // Inspects whether or not cl_khr_fp64 is supported in case of double precision
+    auto extensions = device_.Extensions();
+    if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
+      if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
+        return StatusCode::kNoDoublePrecision;
+      }
+    }
+
+    // As above, but for cl_khr_fp16 (half precision)
+    if (precision_ == Precision::kHalf) {
+      if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
+        return StatusCode::kNoHalfPrecision;
+      }
+    }
+
+    // Loads the common header (typedefs and defines and such); the included file is a C++11
+    // raw string literal, so this expands to a normal string initialisation.
+    std::string common_header =
+      #include "kernels/common.opencl"
+
+    // Collects the parameters for this device in the form of defines, and adds the precision
+    auto defines = db_.GetDefines();
+    defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
+    auto source_string = defines + common_header + routine_source;
+
+    // Compiles the kernel
+    try {
+      auto program = Program(context_, source_string);
+      auto options = std::string{};
+      auto status = program.Build(device_, options);
+
+      // Checks for compiler crashes/errors/warnings
+      if (status == CL_BUILD_PROGRAM_FAILURE) {
+        auto message = program.GetBuildInfo(device_);
+        fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
+        return StatusCode::kBuildProgramFailure;
+      }
+      if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }
+
+      // Store the compiled program in the cache
+      program_cache_.push_back({program, device_name_, precision_, routines_});
+    } catch (...) { return StatusCode::kBuildProgramFailure; }
+  }
+
+  // No errors, normal termination of this function
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Enqueues a kernel, waits for completion, and checks for errors. The launch is effectively
+// synchronous: the function blocks on event_ until the kernel has finished. The 'global'
+// sizes are taken by non-const reference because they may be enlarged in-place to be at
+// least as large as the corresponding 'local' sizes.
+StatusCode Routine::RunKernel(const Kernel &kernel, std::vector<size_t> &global,
+                              const std::vector<size_t> &local) {
+
+  // Tests for validity of the local thread sizes
+  if (local.size() > max_work_item_dimensions_) {
+    return StatusCode::kInvalidLocalNumDimensions;
+  }
+  for (auto i=size_t{0}; i<local.size(); ++i) {
+    if (local[i] > max_work_item_sizes_[i]) { return StatusCode::kInvalidLocalThreadsDim; }
+  }
+  auto local_size = size_t{1};
+  for (auto &item: local) { local_size *= item; }
+  if (local_size > max_work_group_size_) { return StatusCode::kInvalidLocalThreadsTotal; }
+
+  // Make sure the global thread sizes are at least equal to the local sizes
+  for (auto i=size_t{0}; i<global.size(); ++i) {
+    if (global[i] < local[i]) { global[i] = local[i]; }
+  }
+
+  // Tests for local memory usage
+  auto local_mem_usage = kernel.LocalMemUsage(device_);
+  if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
+
+  // Launches the kernel (and checks for launch errors)
+  auto status = queue_.EnqueueKernel(kernel, global, local, event_);
+  if (status != CL_SUCCESS) { return StatusCode::kKernelLaunchError; }
+
+  // Waits for completion of the kernel
+  status = event_.Wait();
+  if (status != CL_SUCCESS) { return StatusCode::kKernelRunError; }
+
+  // No errors, normal termination of this function
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
+// sufficient buffer size.
+// NOTE(review): the check requires ld*two elements, while the strictly minimal requirement is
+// ld*(two-1) + one; this is conservative and may reject tightly-sized buffers -- confirm intent.
+StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buffer &buffer,
+                                const size_t offset, const size_t ld, const size_t data_size) {
+  if (ld < one) { return StatusCode::kInvalidLeadDimA; }
+  try {
+    auto required_size = (ld*two + offset)*data_size;
+    auto buffer_size = buffer.GetSize();
+    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
+  } catch (...) { return StatusCode::kInvalidMatrixA; }
+  return StatusCode::kSuccess;
+}
+
+// Tests matrix B for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
+// sufficient buffer size.
+// NOTE(review): same conservative size check as TestMatrixA (ld*two instead of ld*(two-1)+one).
+StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buffer &buffer,
+                                const size_t offset, const size_t ld, const size_t data_size) {
+  if (ld < one) { return StatusCode::kInvalidLeadDimB; }
+  try {
+    auto required_size = (ld*two + offset)*data_size;
+    auto buffer_size = buffer.GetSize();
+    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; }
+  } catch (...) { return StatusCode::kInvalidMatrixB; }
+  return StatusCode::kSuccess;
+}
+
+// Tests matrix C for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
+// sufficient buffer size.
+// NOTE(review): same conservative size check as TestMatrixA (ld*two instead of ld*(two-1)+one).
+StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buffer &buffer,
+                                const size_t offset, const size_t ld, const size_t data_size) {
+  if (ld < one) { return StatusCode::kInvalidLeadDimC; }
+  try {
+    auto required_size = (ld*two + offset)*data_size;
+    auto buffer_size = buffer.GetSize();
+    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; }
+  } catch (...) { return StatusCode::kInvalidMatrixC; }
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
+// sufficient buffer size.
+// NOTE(review): requires n*inc elements, while the strictly minimal requirement is
+// (n-1)*inc + 1; this conservative check may reject tightly-sized buffers -- confirm intent.
+StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, const size_t offset,
+                                const size_t inc, const size_t data_size) {
+  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
+  try {
+    auto required_size = (n*inc + offset)*data_size;
+    auto buffer_size = buffer.GetSize();
+    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
+  } catch (...) { return StatusCode::kInvalidVectorX; }
+  return StatusCode::kSuccess;
+}
+
+// Tests vector Y for validity: checks for a valid increment, a valid OpenCL buffer, and for a
+// sufficient buffer size.
+// NOTE(review): same conservative size check as TestVectorX (n*inc instead of (n-1)*inc + 1).
+StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
+                                const size_t inc, const size_t data_size) {
+  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
+  try {
+    auto required_size = (n*inc + offset)*data_size;
+    auto buffer_size = buffer.GetSize();
+    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
+  } catch (...) { return StatusCode::kInvalidVectorY; }
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Copies a matrix and pads it with zeros, optionally transposing it on the way. Selects one of
+// four kernels (fast/regular x copy/transpose) based on the arguments and the tuning database,
+// sets its arguments, and launches it. The 'pad' flag selects between the padding (input) and
+// un-padding (output) variants of the non-fast kernels.
+StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
+                                           const size_t src_ld, const size_t src_offset,
+                                           const Buffer &src,
+                                           const size_t dest_one, const size_t dest_two,
+                                           const size_t dest_ld, const size_t dest_offset,
+                                           const Buffer &dest,
+                                           const bool do_transpose, const bool pad,
+                                           const Program &program) {
+
+  // Determines whether or not the fast-version could potentially be used: it requires equal
+  // (unpadded) source and destination shapes and zero offsets.
+  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) &&
+                        (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
+
+  // Determines the right kernel; the fast kernels additionally require the sizes to be
+  // multiples of the relevant tuning parameters.
+  auto kernel_name = std::string{};
+  if (do_transpose) {
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db_["TRA_WPT"]) &&
+        IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) &&
+        IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"])) {
+      kernel_name = "TransposeMatrix";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
+    }
+  }
+  else {
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db_["COPY_VW"]) &&
+        IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) &&
+        IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"])) {
+      kernel_name = "CopyMatrix";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (pad) ? "PadMatrix" : "UnPadMatrix";
+    }
+  }
+
+  // Retrieves the kernel from the compiled binary
+  try {
+    auto kernel = Kernel(program, kernel_name);
+
+    // Sets the kernel arguments
+    if (use_fast_kernel) {
+      kernel.SetArgument(0, static_cast<int>(src_ld));
+      kernel.SetArgument(1, src());
+      kernel.SetArgument(2, dest());
+    }
+    else {
+      kernel.SetArgument(0, static_cast<int>(src_one));
+      kernel.SetArgument(1, static_cast<int>(src_two));
+      kernel.SetArgument(2, static_cast<int>(src_ld));
+      kernel.SetArgument(3, static_cast<int>(src_offset));
+      kernel.SetArgument(4, src());
+      kernel.SetArgument(5, static_cast<int>(dest_one));
+      kernel.SetArgument(6, static_cast<int>(dest_two));
+      kernel.SetArgument(7, static_cast<int>(dest_ld));
+      kernel.SetArgument(8, static_cast<int>(dest_offset));
+      kernel.SetArgument(9, dest());
+    }
+
+    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
+    // parameters in the database.
+    auto status = StatusCode::kSuccess;
+    if (do_transpose) {
+      if (use_fast_kernel) {
+        auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"],
+                                          dest_two / db_["TRA_WPT"]};
+        auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
+        status = RunKernel(kernel, global, local);
+      }
+      else {
+        auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
+                                          Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
+        auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
+        status = RunKernel(kernel, global, local);
+      }
+    }
+    else {
+      if (use_fast_kernel) {
+        auto global = std::vector<size_t>{dest_one / db_["COPY_VW"],
+                                          dest_two / db_["COPY_WPT"]};
+        auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
+        status = RunKernel(kernel, global, local);
+      }
+      else {
+        auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                          Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+        auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+        status = RunKernel(kernel, global, local);
+      }
+    }
+    return status;
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+
+// Queries the cache and retrieves a matching program. Assumes that the match is available
+// (i.e. SetUp has run for this device/precision/routines combination), throws otherwise.
+Program Routine::GetProgramFromCache() const {
+  for (auto &cached_program: program_cache_) {
+    if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
+      return cached_program.program;
+    }
+  }
+  throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
+}
+
+// Queries the cache to see whether or not the compiled kernel is already there. Non-throwing
+// counterpart of GetProgramFromCache.
+bool Routine::ProgramIsInCache() const {
+  for (auto &cached_program: program_cache_) {
+    if (cached_program.MatchInCache(device_name_, precision_, routines_)) { return true; }
+  }
+  return false;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/xaxpy.cc b/src/routines/xaxpy.cc
new file mode 100644
index 00000000..309ae3ce
--- /dev/null
+++ b/src/routines/xaxpy.cc
@@ -0,0 +1,115 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/xaxpy.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
+template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
+template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor, registering the single "Xaxpy" kernel
+// source and the precision matching the template argument.
+template <typename T>
+Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Xaxpy"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine: computes y := alpha*x + y. Validates the vectors, selects either the fast
+// kernel (contiguous, offset-free, size a multiple of the tuning parameters) or the generic
+// one, and launches it synchronously. Returns kSuccess or a specific error code.
+template <typename T>
+StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
+                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // Makes sure all dimensions are larger than zero
+  if (n == 0) { return StatusCode::kInvalidDimension; }
+
+  // Tests the vectors for validity
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines whether or not the fast-version can be used
+  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
+                         (y_offset == 0) && (y_inc == 1) &&
+                         IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
+
+  // If possible, run the fast-version of the kernel
+  auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
+
+  // Retrieves the Xaxpy kernel from the compiled binary
+  try {
+    auto program = GetProgramFromCache();
+    auto kernel = Kernel(program, kernel_name);
+
+    // Sets the kernel arguments; the fast variant needs no offset/increment arguments.
+    if (use_fast_kernel) {
+      kernel.SetArgument(0, static_cast<int>(n));
+      kernel.SetArgument(1, alpha);
+      kernel.SetArgument(2, x_buffer());
+      kernel.SetArgument(3, y_buffer());
+    }
+    else {
+      kernel.SetArgument(0, static_cast<int>(n));
+      kernel.SetArgument(1, alpha);
+      kernel.SetArgument(2, x_buffer());
+      kernel.SetArgument(3, static_cast<int>(x_offset));
+      kernel.SetArgument(4, static_cast<int>(x_inc));
+      kernel.SetArgument(5, y_buffer());
+      kernel.SetArgument(6, static_cast<int>(y_offset));
+      kernel.SetArgument(7, static_cast<int>(y_inc));
+    }
+
+    // Launches the kernel
+    if (use_fast_kernel) {
+      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+      auto local = std::vector<size_t>{db_["WGS"]};
+      status = RunKernel(kernel, global, local);
+    }
+    else {
+      auto n_ceiled = Ceil(n, db_["WGS"]);
+      auto global = std::vector<size_t>{CeilDiv(n_ceiled, db_["WPT"])};
+      auto local = std::vector<size_t>{db_["WGS"]};
+      status = RunKernel(kernel, global, local);
+    }
+    if (ErrorIn(status)) { return status; }
+
+    // Waits for all kernels to finish
+    queue_.Finish();
+
+    // Successfully finished the computation
+    return StatusCode::kSuccess;
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xaxpy<float>;
+template class Xaxpy<double>;
+template class Xaxpy<float2>;
+template class Xaxpy<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc
new file mode 100644
index 00000000..16bbc154
--- /dev/null
+++ b/src/routines/xgemm.cc
@@ -0,0 +1,168 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/xgemm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
+template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
+template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor, registering all kernel sources needed by
+// GEMM (the pre/post-processing copy/pad/transpose kernels plus the main Xgemm kernel).
+template <typename T>
+Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine: computes C := alpha*op(A)*op(B) + beta*C. The inputs are first padded and
+// (if needed) transposed into temporary column-major buffers sized to multiples of the tuning
+// parameters, the main Xgemm kernel is run on those, and the result is un-padded back into C.
+template <typename T>
+StatusCode Xgemm<T>::DoGemm(const Layout layout,
+                            const Transpose a_transpose, const Transpose b_transpose,
+                            const size_t m, const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed. Note
+  // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of
+  // col-major) to be transformed, so transposing requirements are not the same as whether or not
+  // the matrix is actually transposed in memory.
+  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
+                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
+  auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
+                   (layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+  auto a_do_transpose = a_rotated;
+  auto b_do_transpose = !b_rotated;
+  auto c_do_transpose = c_rotated;
+
+  // Computes the first and second dimensions of the 3 matrices taking into account whether the
+  // matrices are rotated or not
+  auto a_one = (a_rotated) ? k : m;
+  auto a_two = (a_rotated) ? m : k;
+  auto b_one = (b_rotated) ? n : k;
+  auto b_two = (b_rotated) ? k : n;
+  auto c_one = (c_rotated) ? n : m;
+  auto c_two = (c_rotated) ? m : n;
+
+  // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than K when rotated, or less than M when not-rotated
+  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N when rotated, or less than M when not-rotated
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of m, n, and k
+  auto m_ceiled = Ceil(m, db_["MWG"]);
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
+    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto program = GetProgramFromCache();
+
+    // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
+    // them up until they reach a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                    m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
+                                    a_do_transpose, true, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
+                                    b_do_transpose, true, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Only necessary for matrix C if it used both as input and output
+    // NOTE(review): when beta == 0, temp_c is left uninitialized here, yet the kernel's
+    // StoreResults still reads it (scaled by beta). Fine for zero*anything arithmetic, but
+    // could propagate NaNs from uninitialized memory -- confirm this is acceptable.
+    if (beta != static_cast<T>(0)) {
+      status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
+                                      m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
+                                      c_do_transpose, true, program);
+      if (ErrorIn(status)) { return status; }
+    }
+
+    // Retrieves the Xgemm kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, "Xgemm");
+
+      // Sets the kernel arguments; note that the padded (ceiled) sizes are passed.
+      kernel.SetArgument(0, static_cast<int>(m_ceiled));
+      kernel.SetArgument(1, static_cast<int>(n_ceiled));
+      kernel.SetArgument(2, static_cast<int>(k_ceiled));
+      kernel.SetArgument(3, alpha);
+      kernel.SetArgument(4, beta);
+      kernel.SetArgument(5, temp_a());
+      kernel.SetArgument(6, temp_b());
+      kernel.SetArgument(7, temp_c());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (m_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel: un-pads the result back into the user's C buffer.
+      status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
+                                      c_one, c_two, c_ld, c_offset, c_buffer,
+                                      c_do_transpose, false, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgemm<float>;
+template class Xgemm<double>;
+template class Xgemm<float2>;
+template class Xgemm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/xsymm.cc b/src/routines/xsymm.cc
new file mode 100644
index 00000000..aa43593d
--- /dev/null
+++ b/src/routines/xsymm.cc
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/xsymm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+// Xsymm derives from Xgemm and re-uses its kernel program and tuning database, so no
+// additional state is initialised here.
+template <typename T>
+Xsymm<T>::Xsymm(CommandQueue &queue, Event &event):
+    Xgemm<T>(queue, event) {
+}
+
+// =================================================================================================
+
+// The main routine
+//
+// Implements SYMM (matrix-matrix multiply with a symmetric matrix) on top of the regular Xgemm
+// routine: the stored triangle of the symmetric matrix A is first expanded on the device into a
+// temporary dense k-by-k matrix, after which DoGemm is called on that dense copy. Returns
+// kSuccess on completion, or the first error status encountered.
+template <typename T>
+StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
+  // left) or B (on the right) in the Xgemm routine.
+  size_t k = (side == Side::kLeft) ? m : n;
+
+  // Checks for validity of the squared A matrix
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
+  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
+
+  // Temporary buffer for a copy of the symmetric matrix
+  // (k*k elements; released automatically when 'temp_symm' goes out of scope)
+  try {
+    auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+
+    // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
+    // routine afterwards
+    try {
+      auto program = GetProgramFromCache();
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the arguments for the symmetric-to-squared kernel
+      // Arguments 0-3 describe the source (the user's triangular A: ld, offset, buffer);
+      // arguments 4-7 describe the destination (a dense k-by-k matrix, ld == k, offset 0).
+      // This presumably mirrors the pad kernel's argument convention -- see kernels/pad.opencl.
+      kernel.SetArgument(0, static_cast<int>(k));
+      kernel.SetArgument(1, static_cast<int>(a_ld));
+      kernel.SetArgument(2, static_cast<int>(a_offset));
+      kernel.SetArgument(3, a_buffer());
+      kernel.SetArgument(4, static_cast<int>(k));
+      kernel.SetArgument(5, static_cast<int>(k));
+      kernel.SetArgument(6, static_cast<int>(0));
+      kernel.SetArgument(7, temp_symm());
+
+      // Uses the common padding kernel's thread configuration. This is allowed, since the
+      // symmetry-to-squared kernel uses the same parameters.
+      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the regular Xgemm code with either "C := AB+C" or ...
+      if (side == Side::kLeft) {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        temp_symm, 0, k,
+                        b_buffer, b_offset, b_ld,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+      }
+
+      // ... with "C := BA+C". Note that A and B are now reversed.
+      else {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        b_buffer, b_offset, b_ld,
+                        temp_symm, 0, k,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+
+        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+        // (any status code not listed below is passed through unchanged)
+        switch(status) {
+          case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
+          case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
+          case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
+          case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
+          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
+          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
+        }
+      }
+
+      // Return the status of the Xgemm routine
+      return status;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+// Explicit instantiations for all supported precisions (float2/double2 are the complex types).
+template class Xsymm<float>;
+template class Xsymm<double>;
+template class Xsymm<float2>;
+template class Xsymm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc
new file mode 100644
index 00000000..da223bf0
--- /dev/null
+++ b/src/tuning/copy.cc
@@ -0,0 +1,83 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the copy OpenCL kernels. It uses CLTune.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The copy auto-tuner
+//
+// Configures a CLTune session for the 'CopyMatrix' kernel: registers the kernel source, a
+// reference configuration (8x8 work-group) to validate results against, the tunable parameters,
+// the thread-size modifiers, and the kernel arguments. Tuning itself is started by the caller
+// (TunerAB). 'a_mat' is the input matrix, 'b_mat' receives the copied output.
+template <typename T>
+void CopyTune(const Arguments<T> &args,
+              const std::vector<T> &a_mat, std::vector<T> &b_mat,
+              cltune::Tuner &tuner) {
+
+  // This points to the CopyMatrix kernel as found in the CLBlast library. This is just one example
+  // of a copy kernel. However, all copy-kernels use the same tuning parameters, so one has to be
+  // chosen as a representative.
+  // Note: the .opencl files must expand to C++ string literals for this #include trick to work.
+  std::string common_source =
+  #include "../src/kernels/common.opencl"
+  std::string kernel_source =
+  #include "../src/kernels/copy.opencl"
+  auto sources = common_source + kernel_source;
+  auto id = tuner.AddKernelFromString(sources, "CopyMatrix", {args.m, args.n}, {1, 1});
+  tuner.SetReferenceFromString(sources, "CopyMatrix", {args.m, args.n}, {8, 8});
+
+  // Sets the tunable parameters and their possible values
+  tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32});
+  tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32});
+  tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8});
+  tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8});
+
+  // Tests for a specific precision
+  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+  // Modifies the thread-sizes (both global and local) based on the parameters
+  tuner.MulLocalSize(id, {"COPY_DIMX", "COPY_DIMY"});
+  tuner.DivGlobalSize(id, {"COPY_VW", "COPY_WPT"});
+
+  // Sets the function's arguments
+  // (the scalar is presumably the leading dimension, equal to args.m here -- see copy.opencl)
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
+  tuner.AddArgumentInput(a_mat);
+  tuner.AddArgumentOutput(b_mat);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+// Dispatches on the precision given on the command-line; half precision is not supported and
+// raises a std::runtime_error.
+void TunerCopy(int argc, char *argv[]) {
+  switch(GetPrecision(argc, argv)) {
+    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case Precision::kSingle: TunerAB<float>(argc, argv, CopyTune<float>); break;
+    case Precision::kDouble: TunerAB<double>(argc, argv, CopyTune<double>); break;
+    case Precision::kComplexSingle: TunerAB<float2>(argc, argv, CopyTune<float2>); break;
+    case Precision::kComplexDouble: TunerAB<double2>(argc, argv, CopyTune<double2>); break;
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+// Thin entry point: all work is delegated to clblast::TunerCopy.
+int main(int argc, char *argv[]) {
+  clblast::TunerCopy(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc
new file mode 100644
index 00000000..93312df2
--- /dev/null
+++ b/src/tuning/pad.cc
@@ -0,0 +1,90 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the pad-copy OpenCL kernels. It uses CLTune.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The pad auto-tuner
+//
+// Configures a CLTune session for the 'PadMatrix' kernel: registers the kernel source, a
+// reference configuration to validate against, the tunable parameters, the thread-size
+// modifiers, and the kernel arguments. Tuning itself is started by the caller (TunerAB).
+template <typename T>
+void PadTune(const Arguments<T> &args,
+             const std::vector<T> &a_mat, std::vector<T> &b_mat,
+             cltune::Tuner &tuner) {
+
+  // This points to the PadMatrix kernel as found in the CLBlast library. This is just one
+  // example of a pad kernel. However, all pad-kernels use the same tuning parameters, so one has
+  // to be chosen as a representative.
+  // Note: the .opencl files must expand to C++ string literals for this #include trick to work.
+  std::string common_source =
+  #include "../src/kernels/common.opencl"
+  std::string kernel_source =
+  #include "../src/kernels/pad.opencl"
+  auto sources = common_source + kernel_source;
+  auto id = tuner.AddKernelFromString(sources, "PadMatrix", {args.m, args.n}, {1, 1});
+  tuner.SetReferenceFromString(sources, "PadMatrix", {args.m, args.n}, {8, 8});
+
+  // Sets the tunable parameters and their possible values
+  tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32});
+  tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32});
+  tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4});
+  tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4});
+
+  // Tests for a specific precision
+  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+  // Modifies the thread-sizes (both global and local) based on the parameters
+  tuner.MulLocalSize(id, {"PAD_DIMX", "PAD_DIMY"});
+  tuner.DivGlobalSize(id, {"PAD_WPTX", "PAD_WPTY"});
+
+  // Sets the function's arguments
+  // Source matrix description (dimensions m x n, ld == m, offset 0), then the source buffer;
+  // destination description in the same format, then the destination buffer.
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
+  tuner.AddArgumentScalar(static_cast<int>(args.n));
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
+  tuner.AddArgumentScalar(0);
+  tuner.AddArgumentInput(a_mat);
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
+  tuner.AddArgumentScalar(static_cast<int>(args.n));
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
+  tuner.AddArgumentScalar(0);
+  tuner.AddArgumentOutput(b_mat);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+// Dispatches on the precision given on the command-line; half precision is not supported and
+// raises a std::runtime_error.
+void TunerPad(int argc, char *argv[]) {
+  switch(GetPrecision(argc, argv)) {
+    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case Precision::kSingle: TunerAB<float>(argc, argv, PadTune<float>); break;
+    case Precision::kDouble: TunerAB<double>(argc, argv, PadTune<double>); break;
+    case Precision::kComplexSingle: TunerAB<float2>(argc, argv, PadTune<float2>); break;
+    case Precision::kComplexDouble: TunerAB<double2>(argc, argv, PadTune<double2>); break;
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+// Thin entry point: all work is delegated to clblast::TunerPad.
+int main(int argc, char *argv[]) {
+  clblast::TunerPad(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc
new file mode 100644
index 00000000..b2af9925
--- /dev/null
+++ b/src/tuning/padtranspose.cc
@@ -0,0 +1,95 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the pad-transpose OpenCL kernels. It uses CLTune.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The transpose auto-tuner
+//
+// Configures a CLTune session for the 'PadTransposeMatrix' kernel. Note the swapped (n, m)
+// dimensions in the destination arguments: the output matrix is the transpose of the input.
+// A local-memory-size constraint prunes configurations that would exceed the device's limit.
+template <typename T>
+void PadTransposeTune(const Arguments<T> &args,
+                      const std::vector<T> &a_mat, std::vector<T> &b_mat,
+                      cltune::Tuner &tuner) {
+
+  // This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
+  // example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
+  // to be chosen as a representative.
+  // Note: the .opencl files must expand to C++ string literals for this #include trick to work.
+  std::string common_source =
+  #include "../src/kernels/common.opencl"
+  std::string kernel_source =
+  #include "../src/kernels/padtranspose.opencl"
+  auto sources = common_source + kernel_source;
+  auto id = tuner.AddKernelFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {1, 1});
+  tuner.SetReferenceFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {8, 8});
+
+  // Sets the tunable parameters and their possible values
+  tuner.AddParameter(id, "PADTRA_TILE", {8, 16, 32, 64});
+  tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16});
+  tuner.AddParameter(id, "PADTRA_PAD", {0, 1});
+
+  // Tests for a specific precision
+  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+  // Sets the constraints for local memory size limitations
+  // v[0]=PADTRA_TILE, v[1]=PADTRA_WPT, v[2]=PADTRA_PAD: a (TILE*WPT) x (TILE*WPT + PAD)
+  // local-memory tile of elements of the current precision.
+  auto LocalMemorySize = [args] (std::vector<size_t> v) {
+    return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
+  };
+  tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"});
+
+  // Modifies the thread-sizes (both global and local) based on the parameters
+  tuner.DivGlobalSize(id, {"PADTRA_WPT", "PADTRA_WPT"});
+  tuner.MulLocalSize(id, {"PADTRA_TILE", "PADTRA_TILE"});
+
+  // Sets the function's arguments
+  // Source: m x n with ld == m, offset 0; destination: n x m with ld == n, offset 0 (transposed).
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
+  tuner.AddArgumentScalar(static_cast<int>(args.n));
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
+  tuner.AddArgumentScalar(0);
+  tuner.AddArgumentInput(a_mat);
+  tuner.AddArgumentScalar(static_cast<int>(args.n));
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
+  tuner.AddArgumentScalar(static_cast<int>(args.n));
+  tuner.AddArgumentScalar(0);
+  tuner.AddArgumentOutput(b_mat);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+// Dispatches on the precision given on the command-line; half precision is not supported and
+// raises a std::runtime_error.
+void TunerPadTranspose(int argc, char *argv[]) {
+  switch(GetPrecision(argc, argv)) {
+    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case Precision::kSingle: TunerAB<float>(argc, argv, PadTransposeTune<float>); break;
+    case Precision::kDouble: TunerAB<double>(argc, argv, PadTransposeTune<double>); break;
+    case Precision::kComplexSingle: TunerAB<float2>(argc, argv, PadTransposeTune<float2>); break;
+    case Precision::kComplexDouble: TunerAB<double2>(argc, argv, PadTransposeTune<double2>); break;
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+// Thin entry point: all work is delegated to clblast::TunerPadTranspose.
+int main(int argc, char *argv[]) {
+  clblast::TunerPadTranspose(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc
new file mode 100644
index 00000000..90392866
--- /dev/null
+++ b/src/tuning/transpose.cc
@@ -0,0 +1,88 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the transpose OpenCL kernels. It uses CLTune.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The transpose auto-tuner
+//
+// Configures a CLTune session for the 'TransposeMatrix' kernel. Unlike the pad-transpose tuner,
+// only a single dimension scalar is passed to the kernel; presumably the kernel operates on a
+// square-laid-out matrix with ld == args.m -- verify against kernels/transpose.opencl.
+template <typename T>
+void TransposeTune(const Arguments<T> &args,
+                   const std::vector<T> &a_mat, std::vector<T> &b_mat,
+                   cltune::Tuner &tuner) {
+
+  // This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
+  // example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
+  // to be chosen as a representative.
+  // Note: the .opencl files must expand to C++ string literals for this #include trick to work.
+  std::string common_source =
+  #include "../src/kernels/common.opencl"
+  std::string kernel_source =
+  #include "../src/kernels/transpose.opencl"
+  auto sources = common_source + kernel_source;
+  auto id = tuner.AddKernelFromString(sources, "TransposeMatrix", {args.m, args.n}, {1, 1});
+  tuner.SetReferenceFromString(sources, "TransposeMatrix", {args.m, args.n}, {8, 8});
+
+  // Sets the tunable parameters and their possible values
+  tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
+  tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
+  tuner.AddParameter(id, "TRA_PAD", {0, 1});
+
+  // Tests for a specific precision
+  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+  // Sets the constraints for local memory size limitations
+  // v[0]=TRA_DIM, v[1]=TRA_WPT, v[2]=TRA_PAD: a (DIM*WPT) x (DIM*WPT + PAD) local-memory tile.
+  auto LocalMemorySize = [args] (std::vector<size_t> v) {
+    return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
+  };
+  tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"});
+
+  // Modifies the thread-sizes (both global and local) based on the parameters
+  tuner.DivGlobalSize(id, {"TRA_WPT", "TRA_WPT"});
+  tuner.MulLocalSize(id, {"TRA_DIM", "TRA_DIM"});
+
+  // Sets the function's arguments
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
+  tuner.AddArgumentInput(a_mat);
+  tuner.AddArgumentOutput(b_mat);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+// Dispatches on the precision given on the command-line; half precision is not supported and
+// raises a std::runtime_error.
+void TunerTranspose(int argc, char *argv[]) {
+  switch(GetPrecision(argc, argv)) {
+    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case Precision::kSingle: TunerAB<float>(argc, argv, TransposeTune<float>); break;
+    case Precision::kDouble: TunerAB<double>(argc, argv, TransposeTune<double>); break;
+    case Precision::kComplexSingle: TunerAB<float2>(argc, argv, TransposeTune<float2>); break;
+    case Precision::kComplexDouble: TunerAB<double2>(argc, argv, TransposeTune<double2>); break;
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+// Thin entry point: all work is delegated to clblast::TunerTranspose.
+int main(int argc, char *argv[]) {
+  clblast::TunerTranspose(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc
new file mode 100644
index 00000000..bb93c053
--- /dev/null
+++ b/src/tuning/tuning.cc
@@ -0,0 +1,186 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the common auto-tuning code to interface with the CLTune library.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
+// the results. Used for vector-vector routines.
+//
+// 'tune_function' is the routine-specific configuration callback (e.g. XaxpyTune); it receives
+// the parsed arguments, the two populated vectors, and the tuner object. This function then runs
+// the full-search tuning process and prints the results.
+template <typename T>
+void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function) {
+
+  // Sets the parameters and platform/device for which to tune (command-line options)
+  auto help = std::string{"* Options given/available:\n"};
+  auto args = Arguments<T>{};
+  args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+  args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+  args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+  args.n = GetArgument(argc, argv, help, kArgN, size_t{4096*1024});
+  args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>());
+  fprintf(stdout, "%s\n", help.c_str());
+
+  // Creates input buffers with random data
+  auto x_vec = std::vector<T>(args.n);
+  auto y_vec = std::vector<T>(args.n);
+  PopulateVector(x_vec);
+  PopulateVector(y_vec);
+
+  // Initializes the tuner for the chosen device
+  cltune::Tuner tuner(args.platform_id, args.device_id);
+
+  // Use full-search to explore all parameter combinations.
+  tuner.UseFullSearch();
+
+  // Configures the tuning parameters (kernel specific)
+  tune_function(args, x_vec, y_vec, tuner);
+
+  // Starts the tuning process
+  tuner.Tune();
+
+  // Prints the results to screen
+  auto time_ms = tuner.PrintToScreen();
+  tuner.PrintFormatted();
+
+  // Also prints the performance of the best-case in terms of GB/s
+  // The factor 3 counts three n-sized vector accesses (e.g. read x, read y, write y for AXPY).
+  // MB divided by ms equals GB/s, hence the printed unit. Guarded against division by zero.
+  const auto mega_bytes = (3*args.n*GetBytes(args.precision)) * 1.0e-6;
+  if (time_ms != 0.0) {
+    printf("[ -------> ] %.1lf ms or %.1lf GB/s\n", time_ms, mega_bytes/time_ms);
+  }
+}
+
+// Compiles the above function
+// Explicit instantiations for all supported precisions.
+template void TunerXY<float>(int, char**, const Tuner2<float>&);
+template void TunerXY<double>(int, char**, const Tuner2<double>&);
+template void TunerXY<float2>(int, char**, const Tuner2<float2>&);
+template void TunerXY<double2>(int, char**, const Tuner2<double2>&);
+
+// =================================================================================================
+
+// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
+// the results. Used for matrix-matrix routines.
+//
+// 'tune_function' is the routine-specific configuration callback (e.g. CopyTune); it receives the
+// parsed arguments, the two populated m*n matrices, and the tuner object. This function then runs
+// the full-search tuning process and prints the results.
+template <typename T>
+void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function) {
+
+  // Sets the parameters and platform/device for which to tune (command-line options)
+  auto help = std::string{"* Options given/available:\n"};
+  auto args = Arguments<T>{};
+  args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+  args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+  args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+  args.m = GetArgument(argc, argv, help, kArgM, size_t{1024});
+  args.n = GetArgument(argc, argv, help, kArgN, size_t{1024});
+  // NOTE(review): 'fraction' is parsed (and shown in the help) but never used here, since this
+  // function uses full-search rather than random-search -- TODO: drop or honor it.
+  args.fraction = GetArgument(argc, argv, help, kArgFraction, 2048.0);
+  fprintf(stdout, "%s\n", help.c_str());
+
+  // Creates input buffers with random data
+  auto a_mat = std::vector<T>(args.m * args.n);
+  auto b_mat = std::vector<T>(args.m * args.n);
+  PopulateVector(a_mat);
+  PopulateVector(b_mat);
+
+  // Initializes the tuner for the chosen device
+  cltune::Tuner tuner(args.platform_id, args.device_id);
+
+  // Use full-search to explore all parameter combinations.
+  tuner.UseFullSearch();
+
+  // Configures the tuning parameters (kernel specific)
+  tune_function(args, a_mat, b_mat, tuner);
+
+  // Starts the tuning process
+  tuner.Tune();
+
+  // Prints the results to screen
+  auto time_ms = tuner.PrintToScreen();
+  tuner.PrintFormatted();
+
+  // Also prints the performance of the best-case in terms of GB/s
+  // The factor 2 counts one read and one write of the m*n matrix. MB divided by ms equals GB/s.
+  const auto mega_bytes = (2*args.m*args.n*GetBytes(args.precision)) * 1.0e-6;
+  if (time_ms != 0.0) {
+    printf("[ -------> ] %.1lf ms or %.1lf GB/s\n", time_ms, mega_bytes/time_ms);
+  }
+}
+
+// Compiles the above function
+// Explicit instantiations for all supported precisions.
+template void TunerAB<float>(int, char**, const Tuner2<float>&);
+template void TunerAB<double>(int, char**, const Tuner2<double>&);
+template void TunerAB<float2>(int, char**, const Tuner2<float2>&);
+template void TunerAB<double2>(int, char**, const Tuner2<double2>&);
+
+// =================================================================================================
+
+// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
+// the results. Used for matrix-matrix-matrix routines.
+//
+// 'tune_function' is the routine-specific configuration callback (e.g. XgemmTune); it receives the
+// parsed arguments, the three populated matrices (A: m*k, B: n*k, C: m*n), and the tuner object.
+// Unlike TunerXY/TunerAB this uses random-search over a fraction of the parameter space.
+template <typename T>
+void TunerABC(int argc, char* argv[], const Tuner3<T> &tune_function) {
+
+  // Sets the parameters and platform/device for which to tune (command-line options)
+  auto help = std::string{"* Options given/available:\n"};
+  auto args = Arguments<T>{};
+  args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+  args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+  args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+  args.m = GetArgument(argc, argv, help, kArgM, size_t{1024});
+  args.n = GetArgument(argc, argv, help, kArgN, size_t{1024});
+  args.k = GetArgument(argc, argv, help, kArgK, size_t{1024});
+  args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>());
+  args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>());
+  args.fraction = GetArgument(argc, argv, help, kArgFraction, 2048.0);
+  fprintf(stdout, "%s\n", help.c_str());
+
+  // Creates input buffers with random data
+  auto a_mat = std::vector<T>(args.m * args.k);
+  auto b_mat = std::vector<T>(args.n * args.k);
+  auto c_mat = std::vector<T>(args.m * args.n);
+  PopulateVector(a_mat);
+  PopulateVector(b_mat);
+  PopulateVector(c_mat);
+
+  // Initializes the tuner for the chosen device
+  cltune::Tuner tuner(args.platform_id, args.device_id);
+
+  // Use random-search to search only a part of the parameter values. The fraction of the search-
+  // space to explore is set as a command-line argument.
+  // NOTE(review): a user-supplied fraction of 0 divides by zero here -- TODO: validate the input.
+  tuner.UseRandomSearch(1.0/args.fraction);
+
+  // Configures the tuning parameters (kernel specific)
+  tune_function(args, a_mat, b_mat, c_mat, tuner);
+
+  // Starts the tuning process
+  tuner.Tune();
+
+  // Prints the results to screen
+  auto time_ms = tuner.PrintToScreen();
+  tuner.PrintFormatted();
+
+  // Also prints the performance of the best-case in terms of GFLOPS
+  // 2*m*n*k is the standard GEMM flop count; MFLOP divided by ms equals GFLOPS.
+  const auto mega_flops = (2*args.m*args.n*args.k) * 1.0e-6;
+  if (time_ms != 0.0) {
+    printf("[ -------> ] %.1lf ms or %.1lf GFLOPS\n", time_ms, mega_flops/time_ms);
+  }
+}
+
+// Compiles the above function
+// Explicit instantiations for all supported precisions.
+template void TunerABC<float>(int, char**, const Tuner3<float>&);
+template void TunerABC<double>(int, char**, const Tuner3<double>&);
+template void TunerABC<float2>(int, char**, const Tuner3<float2>&);
+template void TunerABC<double2>(int, char**, const Tuner3<double2>&);
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/xaxpy.cc b/src/tuning/xaxpy.cc
new file mode 100644
index 00000000..0439ed05
--- /dev/null
+++ b/src/tuning/xaxpy.cc
@@ -0,0 +1,88 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the Xaxpy OpenCL kernel. It uses the CLTune library.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The Xaxpy auto-tuner
+//
+// Configures a CLTune session for the 'XaxpyFast' kernel: registers the kernel source, a
+// reference configuration (work-group size 64), the tunable parameters, the thread-size
+// modifiers, and the kernel arguments. Tuning itself is started by the caller (TunerXY).
+template <typename T>
+void XaxpyTune(const Arguments<T> &args,
+               const std::vector<T> &x_vec, std::vector<T> &y_vec,
+               cltune::Tuner &tuner) {
+
+  // The XaxpyFast kernel only works under certain conditions. Check here whether the condition is
+  // true for the reference kernel
+  // NOTE(review): 64 only covers the reference configuration; tuned configurations can require
+  // 'n' to be a multiple of WGS*WPT*VW (up to 2048*8*8). Presumably invalid configurations are
+  // rejected by the comparison against the reference -- TODO: confirm.
+  if (!IsMultiple(args.n, 64)) {
+    throw std::runtime_error("The 'XaxpyFast' kernel requires 'n' to be a multiple of WGS*WPT*VW");
+  }
+
+  // This points to the XaxpyFast kernel as found in the CLBlast library
+  // Note: the .opencl files must expand to C++ string literals for this #include trick to work.
+  std::string common_source =
+  #include "../src/kernels/common.opencl"
+  std::string kernel_source =
+  #include "../src/kernels/xaxpy.opencl"
+  auto sources = common_source + kernel_source;
+  auto id = tuner.AddKernelFromString(sources, "XaxpyFast", {args.n}, {1});
+  tuner.SetReferenceFromString(sources, "XaxpyFast", {args.n}, {64});
+
+  // Sets the tunable parameters and their possible values
+  tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 2048});
+  tuner.AddParameter(id, "WPT", {1, 2, 4, 8});
+  tuner.AddParameter(id, "VW", {1, 2, 4, 8});
+
+  // Tests for a specific precision
+  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+  // Modifies the thread-sizes (local) based on the parameters
+  tuner.MulLocalSize(id, {"WGS"});
+  tuner.DivGlobalSize(id, {"WPT"});
+  tuner.DivGlobalSize(id, {"VW"});
+
+  // Sets the function's arguments
+  tuner.AddArgumentScalar(static_cast<int>(args.n));
+  tuner.AddArgumentScalar(args.alpha);
+  tuner.AddArgumentInput(x_vec);
+  tuner.AddArgumentOutput(y_vec);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+// Dispatches on the precision given on the command-line; half precision is not supported and
+// raises a std::runtime_error.
+void TunerXaxpy(int argc, char *argv[]) {
+  switch(GetPrecision(argc, argv)) {
+    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case Precision::kSingle: TunerXY<float>(argc, argv, XaxpyTune<float>); break;
+    case Precision::kDouble: TunerXY<double>(argc, argv, XaxpyTune<double>); break;
+    case Precision::kComplexSingle: TunerXY<float2>(argc, argv, XaxpyTune<float2>); break;
+    case Precision::kComplexDouble: TunerXY<double2>(argc, argv, XaxpyTune<double2>); break;
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+// Thin entry point: all work is delegated to clblast::TunerXaxpy.
+int main(int argc, char *argv[]) {
+  clblast::TunerXaxpy(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc
new file mode 100644
index 00000000..aba56810
--- /dev/null
+++ b/src/tuning/xgemm.cc
@@ -0,0 +1,126 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the Xgemm OpenCL kernel. It uses the CLTune library.
+// Note that this tuner uses random-search: running it multiple times or with a larger fraction
+// argument might be necessary to obtain good results.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
// The Xgemm auto-tuner: configures a CLTune tuner for the Xgemm OpenCL kernel using the matrix
// sizes, scalars and precision from 'args'. 'a_mat' and 'b_mat' are the input matrices; 'c_mat'
// is written by the tuner's runs (hence passed by non-const reference).
template <typename T>
void XgemmTune(const Arguments<T> &args,
               const std::vector<T> &a_mat, const std::vector<T> &b_mat, std::vector<T> &c_mat,
               cltune::Tuner &tuner) {

  // This points to the Xgemm kernel as found in the CLBlast library and its golden reference.
  // NOTE(review): the '#include' directives assume the .opencl files expand to C++ string
  // literals (the kernel sources double as embeddable strings) — confirm against those files.
  std::string common_source =
  #include "../src/kernels/common.opencl"
  std::string kernel_source =
  #include "../src/kernels/xgemm.opencl"
  auto sources = common_source + kernel_source;
  auto id = tuner.AddKernelFromString(sources, "Xgemm", {args.m, args.n}, {1, 1});
  tuner.SetReferenceFromString(sources, "Xgemm", {args.m, args.n}, {8, 8});

  // Sets the tunable parameters and their possible values
  tuner.AddParameter(id, "MWG", {16, 32, 64, 128});
  tuner.AddParameter(id, "NWG", {16, 32, 64, 128});
  tuner.AddParameter(id, "KWG", {16, 32});
  tuner.AddParameter(id, "MDIMC", {8, 16, 32});
  tuner.AddParameter(id, "NDIMC", {8, 16, 32});
  tuner.AddParameter(id, "MDIMA", {8, 16, 32});
  tuner.AddParameter(id, "NDIMB", {8, 16, 32});
  tuner.AddParameter(id, "KWI", {2, 8});
  tuner.AddParameter(id, "VWM", {1, 2, 4, 8});
  tuner.AddParameter(id, "VWN", {1, 2, 4, 8});
  tuner.AddParameter(id, "STRM", {0, 1});
  tuner.AddParameter(id, "STRN", {0, 1});
  tuner.AddParameter(id, "SA", {0, 1});
  tuner.AddParameter(id, "SB", {0, 1});

  // Tests for a specific precision: PRECISION is a single-valued "parameter" so that the kernel
  // and the reference are compiled for the precision under test.
  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));

  // Sets the helper functions to implement the constraints below. Each receives the values of the
  // parameter names listed in the corresponding AddConstraint call, in that order.
  auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
  auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
  auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };

  // Sets constraints: Requirement for unrolling the KWG loop
  tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"});

  // Sets constraints: Required for integer MWI and NWI
  tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"});
  tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"});

  // Sets constraints: Required for integer MWIA and NWIB
  tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"});
  tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"});

  // Sets constraints: KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...)
  tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"});
  tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"});

  // Sets the constraints for local memory size limitations: local memory is only consumed when the
  // SA/SB switches (v[0] and v[4], values 0 or 1) enable staging of A and/or B in local memory.
  auto LocalMemorySize = [args] (std::vector<size_t> v) {
    return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision));
  };
  tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM",
                                                  "SB", "KWG", "NWG", "VWN"});

  // Modifies the thread-sizes (both global and local) based on the parameters: each work-group is
  // MDIMC x NDIMC threads and computes an MWG x NWG tile of the result.
  tuner.MulLocalSize(id, {"MDIMC", "NDIMC"});
  tuner.MulGlobalSize(id, {"MDIMC", "NDIMC"});
  tuner.DivGlobalSize(id, {"MWG", "NWG"});

  // Sets the function's arguments (must match the Xgemm kernel's signature order)
  tuner.AddArgumentScalar(static_cast<int>(args.m));
  tuner.AddArgumentScalar(static_cast<int>(args.n));
  tuner.AddArgumentScalar(static_cast<int>(args.k));
  tuner.AddArgumentScalar(args.alpha);
  tuner.AddArgumentScalar(args.beta);
  tuner.AddArgumentInput(a_mat);
  tuner.AddArgumentInput(b_mat);
  tuner.AddArgumentOutput(c_mat);
}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void TunerXgemm(int argc, char *argv[]) {
+ switch(GetPrecision(argc, argv)) {
+ case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case Precision::kSingle: TunerABC<float>(argc, argv, XgemmTune<float>); break;
+ case Precision::kDouble: TunerABC<double>(argc, argv, XgemmTune<double>); break;
+ case Precision::kComplexSingle: TunerABC<float2>(argc, argv, XgemmTune<float2>); break;
+ case Precision::kComplexDouble: TunerABC<double2>(argc, argv, XgemmTune<double2>); break;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ clblast::TunerXgemm(argc, argv);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/utilities.cc b/src/utilities.cc
new file mode 100644
index 00000000..80cea852
--- /dev/null
+++ b/src/utilities.cc
@@ -0,0 +1,255 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the common (test) utility functions.
+//
+// =================================================================================================
+
#include "internal/utilities.h"

#include <chrono>
#include <iomanip>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
// Implements the string conversion using std::to_string if possible.
// Explicitly instantiated below for the arithmetic types used by the clients and tuners.
template <typename T>
std::string ToString(T value) {
  auto text = std::to_string(value);
  return text;
}
template std::string ToString<int>(int value);
template std::string ToString<size_t>(size_t value);
template std::string ToString<float>(float value);
template std::string ToString<double>(double value);
+
+// If not possible directly: special cases for complex data-types
+template <>
+std::string ToString(float2 value) {
+ std::ostringstream real, imag;
+ real << std::setprecision(2) << value.real();
+ imag << std::setprecision(2) << value.imag();
+ return real.str()+"+"+imag.str()+"i";
+}
+template <>
+std::string ToString(double2 value) {
+ std::ostringstream real, imag;
+ real << std::setprecision(2) << value.real();
+ imag << std::setprecision(2) << value.imag();
+ return real.str()+"+"+imag.str()+"i";
+}
+
+// If not possible directly: special cases for CLBlast data-types
+template <>
+std::string ToString(Layout value) {
+ switch(value) {
+ case Layout::kRowMajor: return ToString(static_cast<int>(value))+" (row-major)";
+ case Layout::kColMajor: return ToString(static_cast<int>(value))+" (col-major)";
+ }
+}
+template <>
+std::string ToString(Transpose value) {
+ switch(value) {
+ case Transpose::kNo: return ToString(static_cast<int>(value))+" (regular)";
+ case Transpose::kYes: return ToString(static_cast<int>(value))+" (transposed)";
+ case Transpose::kConjugate: return ToString(static_cast<int>(value))+" (conjugate)";
+ }
+}
+template <>
+std::string ToString(Side value) {
+ switch(value) {
+ case Side::kLeft: return ToString(static_cast<int>(value))+" (left)";
+ case Side::kRight: return ToString(static_cast<int>(value))+" (right)";
+ }
+}
+template <>
+std::string ToString(Triangle value) {
+ switch(value) {
+ case Triangle::kUpper: return ToString(static_cast<int>(value))+" (upper)";
+ case Triangle::kLower: return ToString(static_cast<int>(value))+" (lower)";
+ }
+}
+template <>
+std::string ToString(Precision value) {
+ switch(value) {
+ case Precision::kHalf: return ToString(static_cast<int>(value))+" (half)";
+ case Precision::kSingle: return ToString(static_cast<int>(value))+" (single)";
+ case Precision::kDouble: return ToString(static_cast<int>(value))+" (double)";
+ case Precision::kComplexSingle: return ToString(static_cast<int>(value))+" (complex-single)";
+ case Precision::kComplexDouble: return ToString(static_cast<int>(value))+" (complex-double)";
+ }
+}
+
+// =================================================================================================
+
+// Helper for the below function to convert the argument to the value type. Adds specialization for
+// complex data-types. Note that complex arguments are accepted as regular values and are copied to
+// both the real and imaginary parts.
+template <typename T>
+T ConvertArgument(const char* value) {
+ return static_cast<T>(std::stod(value));
+}
+template <> float2 ConvertArgument(const char* value) {
+ auto val = static_cast<float>(std::stod(value));
+ return float2{val, val};
+}
+template <> double2 ConvertArgument(const char* value) {
+ auto val = static_cast<double>(std::stod(value));
+ return double2{val, val};
+}
+
+// This function matches patterns in the form of "-option value" or "--option value". It returns a
+// default value in case the option is not found in the argument string.
+template <typename T>
+T GetArgument(const int argc, char *argv[], std::string &help,
+ const std::string &option, const T default_value) {
+
+ // Parses the argument. Note that this supports both the given option (e.g. -device) and one with
+ // an extra dash in front (e.g. --device).
+ auto return_value = static_cast<T>(default_value);
+ for (int c=0; c<argc; ++c) {
+ auto item = std::string{argv[c]};
+ if (item.compare("-"+option) == 0 || item.compare("--"+option) == 0) {
+ ++c;
+ return_value = ConvertArgument<T>(argv[c]);
+ break;
+ }
+ }
+
+ // Updates the help message and returns
+ help += " -"+option+" "+ToString(return_value)+" ";
+ help += (return_value == default_value) ? "[=default]\n" : "\n";
+ return return_value;
+}
+
// Compiles the above function: explicit instantiations for every argument type the clients and
// tuners request (plain arithmetic types, complex types, and the CLBlast option enums).
template bool GetArgument<bool>(const int, char **, std::string&, const std::string&, const bool);
template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int);
template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t);
template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float);
template double GetArgument<double>(const int, char **, std::string&, const std::string&, const double);
template float2 GetArgument<float2>(const int, char **, std::string&, const std::string&, const float2);
template double2 GetArgument<double2>(const int, char **, std::string&, const std::string&, const double2);
template Layout GetArgument<Layout>(const int, char **, std::string&, const std::string&, const Layout);
template Transpose GetArgument<Transpose>(const int, char **, std::string&, const std::string&, const Transpose);
template Side GetArgument<Side>(const int, char **, std::string&, const std::string&, const Side);
template Triangle GetArgument<Triangle>(const int, char **, std::string&, const std::string&, const Triangle);
template Precision GetArgument<Precision>(const int, char **, std::string&, const std::string&, const Precision);
+
+// =================================================================================================
+
+// Returns only the precision argument
+Precision GetPrecision(const int argc, char *argv[]) {
+ auto dummy = std::string{};
+ return GetArgument(argc, argv, dummy, kArgPrecision, Precision::kSingle);
+}
+
+// =================================================================================================
+
// Checks whether a flag-style argument (no value) is given. Returns true or false. Also appends
// a line describing the option to the 'help' message.
bool CheckArgument(const int argc, char *argv[], std::string &help,
                   const std::string &option) {

  // Updates the help message
  help += "    -"+option+"\n";

  // Parses the argument. Note that this supports both the given option (e.g. -device) and one with
  // an extra dash in front (e.g. --device). The original code incremented the loop counter before
  // returning, which was dead code (flags carry no value to skip); it is removed here.
  for (int c=0; c<argc; ++c) {
    auto item = std::string{argv[c]};
    if (item.compare("-"+option) == 0 || item.compare("--"+option) == 0) { return true; }
  }
  return false;
}
+
+// =================================================================================================
+
// Returns a random seed. This used to be implemented using 'std::random_device', but that doesn't
// always work. The chrono-timers are more reliable in that sense, but perhaps less random.
unsigned int GetRandomSeed() {
  const auto ticks = std::chrono::system_clock::now().time_since_epoch().count();
  return static_cast<unsigned int>(ticks);
}
+
+// Create a random number generator and populates a vector with samples from a random distribution
+template <typename T>
+void PopulateVector(std::vector<T> &vector) {
+ std::mt19937 mt(GetRandomSeed());
+ std::uniform_real_distribution<T> dist(static_cast<T>(-2.0), static_cast<T>(2.0));
+ for (auto &element: vector) { element = dist(mt); }
+}
+template void PopulateVector<float>(std::vector<float>&);
+template void PopulateVector<double>(std::vector<double>&);
+
+// Specialized versions of the above for complex data-types
+template <>
+void PopulateVector(std::vector<float2> &vector) {
+ std::mt19937 mt(GetRandomSeed());
+ std::uniform_real_distribution<float> dist(-2.0f, 2.0f);
+ for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
+}
+template <>
+void PopulateVector(std::vector<double2> &vector) {
+ std::mt19937 mt(GetRandomSeed());
+ std::uniform_real_distribution<double> dist(-2.0, 2.0);
+ for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
+}
+
+// =================================================================================================
+
// Returns a scalar with a default value of 2, converted to the requested type.
template <typename T>
T GetScalar() {
  constexpr auto kDefaultScalarValue = 2.0;
  return static_cast<T>(kDefaultScalarValue);
}
template float GetScalar<float>();
template double GetScalar<double>();
+
+// Specialized versions of the above for complex data-types
+template <>
+float2 GetScalar() {
+ return {2.0f, 0.5f};
+}
+template <>
+double2 GetScalar() {
+ return {2.0, 0.5};
+}
+
+// =================================================================================================
+
// Rounding functions performing ceiling and division operations.

// Returns ceil(x / y). Guards the x == 0 case explicitly: the 1 + (x - 1)/y form would
// otherwise wrap around on the unsigned subtraction and yield a huge value instead of 0.
size_t CeilDiv(const size_t x, const size_t y) {
  if (x == 0) { return 0; }
  return 1 + ((x - 1) / y);
}

// Rounds x up to the nearest multiple of y.
size_t Ceil(const size_t x, const size_t y) {
  return CeilDiv(x,y)*y;
}
+
// Helper function to determine whether or not 'a' is a multiple of 'b'. Uses the modulo operator
// instead of the round-trip division, and treats b == 0 as 'not a multiple' rather than dividing
// by zero. (Also drops the stray ';' that followed the original function body.)
bool IsMultiple(const size_t a, const size_t b) {
  return b != 0 && (a % b) == 0;
}
+
+// =================================================================================================
+
+// Convert the precision enum (as integer) into bytes
+size_t GetBytes(const Precision precision) {
+ switch(precision) {
+ case Precision::kHalf: return 2;
+ case Precision::kSingle: return 4;
+ case Precision::kDouble: return 8;
+ case Precision::kComplexSingle: return 8;
+ case Precision::kComplexDouble: return 16;
+ }
+}
+
+// =================================================================================================
+} // namespace clblast