From 54a8723f8cd4f34a08d651216d680578ffc47fa5 Mon Sep 17 00:00:00 2001
From: CNugteren <web@cedricnugteren.nl>
Date: Mon, 12 Oct 2015 08:28:40 +0200
Subject: Moved level3 kernel files to a subfolder

---
 src/kernels/copy.opencl                |  73 ----
 src/kernels/level3/copy.opencl         |  73 ++++
 src/kernels/level3/pad.opencl          | 349 +++++++++++++++++
 src/kernels/level3/padtranspose.opencl | 164 ++++++++
 src/kernels/level3/transpose.opencl    | 149 +++++++
 src/kernels/level3/xgemm.opencl        | 683 +++++++++++++++++++++++++++++++++
 src/kernels/pad.opencl                 | 349 -----------------
 src/kernels/padtranspose.opencl        | 164 --------
 src/kernels/transpose.opencl           | 149 -------
 src/kernels/xgemm.opencl               | 683 ---------------------------------
 src/routines/level3/xgemm.cc           |  10 +-
 src/routines/level3/xher2k.cc          |  10 +-
 src/routines/level3/xherk.cc           |  10 +-
 src/routines/level3/xsyr2k.cc          |  10 +-
 src/routines/level3/xsyrk.cc           |  10 +-
 src/tuning/copy.cc                     |   2 +-
 src/tuning/pad.cc                      |   2 +-
 src/tuning/padtranspose.cc             |   2 +-
 src/tuning/transpose.cc                |   2 +-
 src/tuning/xgemm.cc                    |   2 +-
 20 files changed, 1448 insertions(+), 1448 deletions(-)
 delete mode 100644 src/kernels/copy.opencl
 create mode 100644 src/kernels/level3/copy.opencl
 create mode 100644 src/kernels/level3/pad.opencl
 create mode 100644 src/kernels/level3/padtranspose.opencl
 create mode 100644 src/kernels/level3/transpose.opencl
 create mode 100644 src/kernels/level3/xgemm.opencl
 delete mode 100644 src/kernels/pad.opencl
 delete mode 100644 src/kernels/padtranspose.opencl
 delete mode 100644 src/kernels/transpose.opencl
 delete mode 100644 src/kernels/xgemm.opencl

(limited to 'src')

diff --git a/src/kernels/copy.opencl b/src/kernels/copy.opencl
deleted file mode 100644
index 7dde688b..00000000
--- a/src/kernels/copy.opencl
+++ /dev/null
@@ -1,73 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains the common kernels shared among different BLAS routines. This file contains
-// kernels to copy matrices.
-//
-// =================================================================================================
-
-// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
-// literal). Comment-out this line for syntax-highlighting when developing.
-R"(
-
-// =================================================================================================
-
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef COPY_DIMX
-  #define COPY_DIMX 8      // Local workgroup size in the first dimension (x)
-#endif
-#ifndef COPY_DIMY
-  #define COPY_DIMY 8      // Local workgroup size in the second dimension (y)
-#endif
-#ifndef COPY_WPT
-  #define COPY_WPT 1       // Work per thread in the first dimension (x)
-#endif
-#ifndef COPY_VW
-  #define COPY_VW 1        // Vector width in the second dimension (y)
-#endif
-
-// =================================================================================================
-
-// Data-widths
-#if COPY_VW == 1
-  typedef real realC;
-#elif COPY_VW == 2
-  typedef real2 realC;
-#elif COPY_VW == 4
-  typedef real4 realC;
-#elif COPY_VW == 8
-  typedef real8 realC;
-#elif COPY_VW == 16
-  typedef real16 realC;
-#endif
-
-// =================================================================================================
-
-// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
-// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
-__kernel void CopyMatrix(const int ld,
-                         __global const realC* restrict src,
-                         __global realC* dest) {
-  #pragma unroll
-  for (int w_one=0; w_one<COPY_WPT; ++w_one) {
-    const int id_one = get_global_id(0);
-    const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
-    const int id = id_two*(ld/COPY_VW) + id_one;
-    dest[id] = src[id];
-  }
-}
-
-// =================================================================================================
-
-// End of the C++11 raw string literal
-)"
-
-// =================================================================================================
diff --git a/src/kernels/level3/copy.opencl b/src/kernels/level3/copy.opencl
new file mode 100644
index 00000000..7dde688b
--- /dev/null
+++ b/src/kernels/level3/copy.opencl
@@ -0,0 +1,73 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS routines. This file contains
+// kernels to copy matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef COPY_DIMX
+  #define COPY_DIMX 8      // Local workgroup size in the first dimension (x)
+#endif
+#ifndef COPY_DIMY
+  #define COPY_DIMY 8      // Local workgroup size in the second dimension (y)
+#endif
+#ifndef COPY_WPT
+  #define COPY_WPT 1       // Work per thread in the second dimension (y)
+#endif
+#ifndef COPY_VW
+  #define COPY_VW 1        // Vector width in the first dimension (x)
+#endif
+
+// =================================================================================================
+
+// Data-widths
+#if COPY_VW == 1
+  typedef real realC;
+#elif COPY_VW == 2
+  typedef real2 realC;
+#elif COPY_VW == 4
+  typedef real4 realC;
+#elif COPY_VW == 8
+  typedef real8 realC;
+#elif COPY_VW == 16
+  typedef real16 realC;
+#endif
+
+// =================================================================================================
+
+// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
+// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
+__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+__kernel void CopyMatrix(const int ld,
+                         __global const realC* restrict src,
+                         __global realC* dest) {
+  #pragma unroll
+  for (int w_one=0; w_one<COPY_WPT; ++w_one) {  // One realC element is copied per iteration
+    const int id_one = get_global_id(0);  // Index in the first dimension, in units of realC
+    const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
+    const int id = id_two*(ld/COPY_VW) + id_one;  // Row stride is 'ld' divided by vector width
+    dest[id] = src[id];  // The same linear index is used for both source and destination
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/pad.opencl b/src/kernels/level3/pad.opencl
new file mode 100644
index 00000000..69324f20
--- /dev/null
+++ b/src/kernels/level3/pad.opencl
@@ -0,0 +1,349 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS routines. This file contains
+// kernels to copy and pad matrices in various ways, including:
+// 1) copying into a larger matrix by adding padding
+// 2) copying into a smaller matrix by removing padding
+// 3) from upper/lower triangle into a full matrix
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef PAD_DIMX
+  #define PAD_DIMX 8      // Local workgroup size in the first dimension (x)
+#endif
+#ifndef PAD_DIMY
+  #define PAD_DIMY 8      // Local workgroup size in the second dimension (y)
+#endif
+#ifndef PAD_WPTX
+  #define PAD_WPTX 1      // Work per thread in the first dimension (x)
+#endif
+#ifndef PAD_WPTY
+  #define PAD_WPTY 1      // Work per thread in the second dimension (y)
+#endif
+
+// =================================================================================================
+
+// Copies a matrix from source to destination. The output is padded with zero values in case the
+// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
+// value and offset can be different.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void PadMatrix(const int src_one, const int src_two,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_one, const int dest_two,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest,
+                        const int do_conjugate) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_two && id_one < dest_one) {
+
+        // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the
+        // value to be written to zero.
+        real value;
+        SetToZero(value);
+        if (id_two < src_two && id_one < src_one) {
+          value = src[id_two*src_ld + id_one + src_offset];
+        }
+
+        // Stores the value in the destination matrix
+        if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+        dest[id_two*dest_ld + id_one + dest_offset] = value;
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
+// writes only the actual data back to the destination matrix. Again, the ld value and offset can
+// be different.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void UnPadMatrix(const int src_one, const int src_two,
+                          const int src_ld, const int src_offset,
+                          __global const real* restrict src,
+                          const int dest_one, const int dest_two,
+                          const int dest_ld, const int dest_offset,
+                          __global real* dest,
+                          const int upper, const int lower,
+                          const int diagonal_imag_zero) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      if (upper == 1) { condition = (id_two >= id_one); }
+      else if (lower == 1) { condition = (id_two <= id_one); }
+      if (condition) {
+
+        // Copies the value into the destination matrix. This is always within bounds of the source
+        // matrix, as we know that the destination matrix is smaller than the source.
+        if (id_two < dest_two && id_one < dest_one) {
+          real value = src[id_two*src_ld + id_one + src_offset];
+          if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
+          dest[id_two*dest_ld + id_one + dest_offset] = value;
+        }
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmLowerToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-symmetric matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
+          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Same as above, but now the matrix's data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmUpperToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-symmetric matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
+          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// =================================================================================================
+#if PRECISION == 3232 || PRECISION == 6464
+
+// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void HermLowerToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-hermitian matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two <= id_one) {
+            result = src[id_two*src_ld + id_one + src_offset];
+            if (id_one == id_two) { result.y = ZERO; }
+          }
+          else {
+            result = src[id_one*src_ld + id_two + src_offset];
+            COMPLEX_CONJUGATE(result);
+          }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Same as above, but now the matrix's data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void HermUpperToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-hermitian matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one <= id_two) {
+            result = src[id_two*src_ld + id_one + src_offset];
+            if (id_one == id_two) { result.y = ZERO; }
+          }
+          else {
+            result = src[id_one*src_ld + id_two + src_offset];
+            COMPLEX_CONJUGATE(result);
+          }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+#endif
+// =================================================================================================
+
+// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void TrmmLowerToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest,
+                                 const int unit_diagonal) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-triangular matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
+          if (id_two == id_one && unit_diagonal) { SetToOne(result); }
+          // Else: result is zero
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Same as above, but now the matrix's data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void TrmmUpperToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest,
+                                 const int unit_diagonal) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-triangular matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
+          if (id_one == id_two && unit_diagonal) { SetToOne(result); }
+          // Else: result is zero
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/padtranspose.opencl b/src/kernels/level3/padtranspose.opencl
new file mode 100644
index 00000000..a6b70f0b
--- /dev/null
+++ b/src/kernels/level3/padtranspose.opencl
@@ -0,0 +1,164 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS functions. This file contains
+// kernels to transpose matrices in various ways, including:
+// 1) transposing into a larger matrix by adding padding
+// 2) transposing into a smaller matrix by removing padding
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef PADTRA_TILE
+  #define PADTRA_TILE 8   // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef PADTRA_WPT
+  #define PADTRA_WPT 1    // Amount of work per thread
+#endif
+#ifndef PADTRA_PAD
+  #define PADTRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
+#endif
+
+// =================================================================================================
+
+// Same as the PadMatrix kernel (pad.opencl), but now also does the transpose
+__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+__kernel void PadTransposeMatrix(const int src_one, const int src_two,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_one, const int dest_two,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest,
+                                 const int do_conjugate) {
+
+  // Local memory to store a tile of the matrix (for coalescing)
+  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
+
+  // Loop over the work per thread
+  #pragma unroll
+  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+    #pragma unroll
+    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+      // Computes the identifiers for the source matrix. Note that the local and global dimensions
+      // do not correspond to each other!
+      const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
+      const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
+
+      // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
+      // Otherwise, set the local memory value to zero.
+      real value;
+      SetToZero(value);
+      if (id_src_two < src_two && id_src_one < src_one) {
+        value = src[id_src_two*src_ld + id_src_one + src_offset];
+      }
+      tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
+    }
+  }
+
+  // Synchronizes all threads in a workgroup
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Loop over the work per thread
+  #pragma unroll
+  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+    #pragma unroll
+    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+      // Computes the identifiers for the destination matrix
+      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
+      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
+
+      // Stores the transposed value in the destination matrix
+      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+        real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+        if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// Same as the UnPadMatrix kernel (pad.opencl), but now also does the transpose
+__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+__kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
+                                   const int src_ld, const int src_offset,
+                                   __global const real* restrict src,
+                                   const int dest_one, const int dest_two,
+                                   const int dest_ld, const int dest_offset,
+                                   __global real* dest,
+                                   const int upper, const int lower,
+                                   const int diagonal_imag_zero) {
+
+  // Local memory to store a tile of the matrix (for coalescing)
+  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
+
+  // Loop over the work per thread
+  #pragma unroll
+  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+    #pragma unroll
+    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+      // Computes the identifiers for the source matrix. Note that the local and global dimensions
+      // do not correspond to each other!
+      const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
+      const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
+
+      // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
+      if ((id_src_one < src_one) && (id_src_two < src_two)) {
+        real value = src[id_src_two*src_ld + id_src_one + src_offset];
+        tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
+      }
+    }
+  }
+
+  // Synchronizes all threads in a workgroup
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Loop over the work per thread
+  #pragma unroll
+  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+    #pragma unroll
+    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+      // Computes the identifiers for the destination matrix
+      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
+      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
+
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
+      else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
+      if (condition) {
+
+        // Stores the transposed value in the destination matrix
+        if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+          real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+          if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
+          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+        }
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/transpose.opencl b/src/kernels/level3/transpose.opencl
new file mode 100644
index 00000000..d726f7ec
--- /dev/null
+++ b/src/kernels/level3/transpose.opencl
@@ -0,0 +1,149 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS functions. This file contains
+// kernels to transpose matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef TRA_DIM
+  #define TRA_DIM 8       // Number of local threads in each of the two dimensions (x,y)
+#endif
+#ifndef TRA_WPT
+  #define TRA_WPT 1       // Work per thread in one dimension and vector-width in the other
+#endif
+#ifndef TRA_PAD
+  #define TRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
+#endif
+#ifndef TRA_SHUFFLE
+  #define TRA_SHUFFLE 0   // Shuffle (1) or not (0) the workgroup indices to avoid bank-conflicts
+#endif
+
+// =================================================================================================
+
+// Data-widths: 'realT' is a vector of TRA_WPT elements of type 'real'
+#if TRA_WPT == 1
+  typedef real realT;
+#elif TRA_WPT == 2
+  typedef real2 realT;
+#elif TRA_WPT == 4
+  typedef real4 realT;
+#elif TRA_WPT == 8
+  typedef real8 realT;
+#elif TRA_WPT == 16
+  typedef real16 realT;
+#endif  // Any other value of TRA_WPT leaves 'realT' undefined
+
+// =================================================================================================
+
+// Transposes and copies a matrix. Both matrices are indexed with the same leading dimension 'ld'
+// and without offset. A more general version is available in 'padtranspose.opencl'.
+__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+__kernel void TransposeMatrix(const int ld,
+                              __global const realT* restrict src,
+                              __global realT* dest) {
+
+  // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
+  // way over workgroups, breaking memory-bank dependencies.
+  const int gid0 = get_group_id(0);
+  #if TRA_SHUFFLE == 1
+    const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
+  #else
+    const int gid1 = get_group_id(1);
+  #endif
+
+  // Local memory to store a tile of the matrix (for coalescing)
+  __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD];
+
+  // Loops over the work per thread
+  #pragma unroll
+  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
+
+    // Computes the identifiers for the source matrix. Note that the local and global dimensions
+    // do not correspond to each other!
+    const int id_one = gid1 * TRA_DIM + get_local_id(0);
+    const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
+
+    // Loads data into the local memory ('src' is indexed in realT vectors, hence 'ld/TRA_WPT')
+    realT value = src[id_two*(ld/TRA_WPT) + id_one];
+    tile[get_local_id(0)*TRA_WPT + w_one][get_local_id(1)] = value;
+  }
+
+  // Synchronizes all threads in a workgroup
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Loads transposed data from the local memory (the two local IDs are swapped w.r.t. the store)
+  realT v[TRA_WPT];
+  #pragma unroll
+  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
+    v[w_one] = tile[get_local_id(1)*TRA_WPT + w_one][get_local_id(0)];
+  }
+
+  // Performs the register-level transpose of the vectorized data: results[i] = i-th lane of all v
+  realT results[TRA_WPT];
+  #if TRA_WPT == 1
+    results[0] = v[0];
+  #elif TRA_WPT == 2
+    results[0] = (realT) {v[0].x, v[1].x};
+    results[1] = (realT) {v[0].y, v[1].y};
+  #elif TRA_WPT == 4
+    results[0] = (realT) {v[0].x, v[1].x, v[2].x, v[3].x};
+    results[1] = (realT) {v[0].y, v[1].y, v[2].y, v[3].y};
+    results[2] = (realT) {v[0].z, v[1].z, v[2].z, v[3].z};
+    results[3] = (realT) {v[0].w, v[1].w, v[2].w, v[3].w};
+  #elif TRA_WPT == 8
+    results[0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0};
+    results[1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1};
+    results[2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2};
+    results[3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3};
+    results[4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4};
+    results[5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5};
+    results[6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6};
+    results[7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7};
+  #elif TRA_WPT == 16
+    results[ 0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0};
+    results[ 1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1};
+    results[ 2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2};
+    results[ 3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3};
+    results[ 4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4};
+    results[ 5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5};
+    results[ 6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6};
+    results[ 7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7};
+    results[ 8] = (realT) {v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8};
+    results[ 9] = (realT) {v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9};
+    results[10] = (realT) {v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA};
+    results[11] = (realT) {v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB};
+    results[12] = (realT) {v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC};
+    results[13] = (realT) {v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD};
+    results[14] = (realT) {v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE};
+    results[15] = (realT) {v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF};
+  #endif
+
+  // Stores the results into the destination matrix (gid0 and gid1 swap roles w.r.t. the load)
+  #pragma unroll
+  for (int w_two=0; w_two<TRA_WPT; ++w_two) {
+    const int id_one = gid0*TRA_DIM + get_local_id(0);
+    const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
+    dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm.opencl b/src/kernels/level3/xgemm.opencl
new file mode 100644
index 00000000..8db0f557
--- /dev/null
+++ b/src/kernels/level3/xgemm.opencl
@@ -0,0 +1,683 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
+// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
+// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
+// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
+//
+// Matrices are accessed as follows:
+// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
+// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
+// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
+//
+// Or as an image (assuming column-major)
+//       K                      
+//    o-------o                 
+//    |       |                 
+//  N | [B^T] |                 
+//    |       |                 
+//    o-------o                 
+//        K               N     
+//    o-------o        o-----o  
+//  M |  [A]  |      M | [C] |  
+//    |       |        |     |  
+//    o-------o        o-----o  
+//                              
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef MWG
+  #define MWG 8      // Tile-size in dimension M (e.g. 64, 128)
+#endif
+#ifndef NWG
+  #define NWG 8      // Tile-size in dimension N (e.g. 64, 128)
+#endif
+#ifndef KWG
+  #define KWG 8      // Tile-size in dimension K (e.g. 8, 16)
+#endif
+#ifndef MDIMC
+  #define MDIMC 8    // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef NDIMC
+  #define NDIMC 8    // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef MDIMA
+  #define MDIMA 8    // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
+#endif
+#ifndef NDIMB
+  #define NDIMB 8    // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
+#endif
+#ifndef KWI
+  #define KWI 1      // Unroll factor of the KWG loop (smaller than or equal to KWG)
+#endif
+#ifndef VWM
+  #define VWM 1      // Vector width of matrices A and C
+#endif
+#ifndef VWN
+  #define VWN 1      // Vector width of matrix B
+#endif
+#ifndef STRM
+  #define STRM 0     // Use strided access within a thread in the M-dimension (1) or not (0)
+#endif
+#ifndef STRN
+  #define STRN 0     // Use strided access within a thread in the N-dimension (1) or not (0)
+#endif
+#ifndef SA
+  #define SA 0       // Use local/shared memory to cache matrix A (1) or not (0)
+#endif
+#ifndef SB
+  #define SB 0       // Use local/shared memory to cache matrix B (1) or not (0)
+#endif
+
+// Helper parameters based on the above tuning parameters
+#define MWI (MWG/MDIMC)               // Work per work-item (M-dimension)
+#define NWI (NWG/NDIMC)               // Work per work-item (N-dimension)
+#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
+#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
+#define MWA (MWG/MDIMA)               // Amount of loads-per-thread for matrix A (M-dimension)
+#define KWA (KWG/KDIMA)               // Amount of loads-per-thread for matrix A (K-dimension)
+#define KWB (KWG/KDIMB)               // Amount of loads-per-thread for matrix B (K-dimension)
+#define NWB (NWG/NDIMB)               // Amount of loads-per-thread for matrix B (N-dimension)
+
+// Settings
+#define USE_VECTOR_MAD 0              // Manually unroll the vector MAD (0) or use 'c += a * b' (1)
+
+// =================================================================================================
+
+// Data-widths in dimension M: 'realM' is a vector of VWM elements of type 'real'
+#if VWM == 1
+    typedef real realM;
+#elif VWM == 2
+    typedef real2 realM;
+#elif VWM == 4
+    typedef real4 realM;
+#elif VWM == 8
+    typedef real8 realM;
+#elif VWM == 16
+    typedef real16 realM;
+#endif
+
+// Data-widths in dimension N: 'realN' is a vector of VWN elements of type 'real'
+#if VWN == 1
+    typedef real realN;
+#elif VWN == 2
+    typedef real2 realN;
+#elif VWN == 4
+    typedef real4 realN;
+#elif VWN == 8
+    typedef real8 realN;
+#elif VWN == 16
+    typedef real16 realN;
+#endif
+
+// =================================================================================================
+
+// Initializes the accumulation registers to zero, calling SetToZero on every vector component
+inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
+  #pragma unroll
+  for (int mi=0; mi<MWI/VWM; ++mi) {
+    #pragma unroll
+    for (int ni=0; ni<NWI; ++ni) {
+      #if VWM == 1
+        SetToZero(cpm[ni][mi]);
+      #elif VWM == 2
+        SetToZero(cpm[ni][mi].x);
+        SetToZero(cpm[ni][mi].y);
+      #elif VWM == 4
+        SetToZero(cpm[ni][mi].x);
+        SetToZero(cpm[ni][mi].y);
+        SetToZero(cpm[ni][mi].z);
+        SetToZero(cpm[ni][mi].w);
+      #elif VWM == 8
+        SetToZero(cpm[ni][mi].s0);
+        SetToZero(cpm[ni][mi].s1);
+        SetToZero(cpm[ni][mi].s2);
+        SetToZero(cpm[ni][mi].s3);
+        SetToZero(cpm[ni][mi].s4);
+        SetToZero(cpm[ni][mi].s5);
+        SetToZero(cpm[ni][mi].s6);
+        SetToZero(cpm[ni][mi].s7);
+      #elif VWM == 16
+        SetToZero(cpm[ni][mi].s0);
+        SetToZero(cpm[ni][mi].s1);
+        SetToZero(cpm[ni][mi].s2);
+        SetToZero(cpm[ni][mi].s3);
+        SetToZero(cpm[ni][mi].s4);
+        SetToZero(cpm[ni][mi].s5);
+        SetToZero(cpm[ni][mi].s6);
+        SetToZero(cpm[ni][mi].s7);
+        SetToZero(cpm[ni][mi].s8);
+        SetToZero(cpm[ni][mi].s9);
+        SetToZero(cpm[ni][mi].sA);
+        SetToZero(cpm[ni][mi].sB);
+        SetToZero(cpm[ni][mi].sC);
+        SetToZero(cpm[ni][mi].sD);
+        SetToZero(cpm[ni][mi].sE);
+        SetToZero(cpm[ni][mi].sF);
+      #endif
+    }
+  }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. Only compiled when SA == 1.
+#if SA == 1
+inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
+                           const int kSizeM, const int tid, const int kwg) {
+  const int la0 = tid % MDIMA;  // Position of this thread in the M-dimension of the A tile
+  const int la1 = tid / MDIMA;  // Position of this thread in the K-dimension of the A tile
+  #pragma unroll
+  for (int mia=0; mia<MWA/VWM; ++mia) {
+    #pragma unroll
+    for (int kia=0; kia<KWA; ++kia) {
+
+      // Computes the indices based on strided/non-strided access
+      #if STRM == 0
+        int mg = mia + la0*(MWA/VWM);
+      #elif STRM == 1
+        int mg = la0 + mia*MDIMA;
+      #endif
+
+      // Computes the indices for the global memory
+      int kg = kia + la1*KWA;
+      int idm = mg + get_group_id(0)*(MWG/VWM);
+      int idk = kg + kwg;
+
+      // Loads the data from global memory (not transposed) into the local memory
+      alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
+    }
+  }
+}
+#endif
+
+// Same as above, but now for the B input matrix. Only compiled when SB == 1.
+#if SB == 1
+inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
+                           const int kSizeN, const int tid, const int kwg) {
+  const int lb0 = tid % NDIMB;  // Position of this thread in the N-dimension of the B tile
+  const int lb1 = tid / NDIMB;  // Position of this thread in the K-dimension of the B tile
+  #pragma unroll
+  for (int kib=0; kib<KWB; ++kib) {
+    #pragma unroll
+    for (int nib=0; nib<NWB/VWN; ++nib) {
+
+      // Computes the indices based on strided/non-strided access
+      #if STRN == 0
+        int ng = nib + lb0*(NWB/VWN);
+      #elif STRN == 1
+        int ng = lb0 + nib*NDIMB;
+      #endif
+
+      // Computes the indices for the global memory
+      int kg = kib + lb1*KWB;
+      int idn = ng + get_group_id(1)*(NWG/VWN);
+      int idk = kg + kwg;
+
+      // Loads the data from global memory (transposed) into the local memory
+      blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
+    }
+  }
+}
+#endif
+
+// =================================================================================================
+
+// Caches global off-chip memory directly into per-thread private memory (registers). This function
+// is specific for caching the A input matrix. Only compiled when SA == 0.
+#if SA == 0
+inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
+                             const int kSizeM, const int idk, const int kwg) {  // 'kwg' is unused
+  #pragma unroll
+  for (int mi=0; mi<MWI/VWM; ++mi) {
+
+    // Computes the indices based on strided/non-strided access
+    #if STRM == 0
+      int mg = mi + get_local_id(0)*(MWI/VWM);
+    #elif STRM == 1
+      int mg = get_local_id(0) + mi*MDIMC;
+    #endif
+
+    // Computes the indices for the global memory
+    int idm = mg + get_group_id(0)*(MWG/VWM);
+
+    // Loads the data from global memory (not transposed) and stores into registers
+    apm[mi] = agm[idk*(kSizeM/VWM) + idm];
+  }
+}
+#endif
+
+// Same as above, but now for the B input matrix. Only compiled when SB == 0.
+#if SB == 0
+inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
+                             const int kSizeN, const int idk) {
+  #pragma unroll
+  for (int ni=0; ni<NWI/VWN; ++ni) {
+
+    // Computes the indices based on strided/non-strided access
+    #if STRN == 0
+      int ng = ni + get_local_id(1)*(NWI/VWN);
+    #elif STRN == 1
+      int ng = get_local_id(1) + ni*NDIMC;
+    #endif
+
+    // Computes the indices for the global memory
+    int idn = ng + get_group_id(1)*(NWG/VWN);
+
+    // Loads the data from global memory (transposed) and stores into registers
+    bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
+  }
+}
+#endif
+
+// =================================================================================================
+
+// Caches on-chip local memory into per-thread private memory (registers). This function is specific
+// for caching the A input matrix: it copies row 'kg' of the local tile. Only compiled when SA == 1.
+#if SA == 1
+inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
+  #pragma unroll
+  for (int mi=0; mi<MWI/VWM; ++mi) {
+    #if STRM == 0
+      int mg = mi + get_local_id(0)*(MWI/VWM);
+    #elif STRM == 1
+      int mg = get_local_id(0) + mi*MDIMC;
+    #endif
+    apm[mi] = alm[kg*(MWG/VWM) + mg];
+  }
+}
+#endif
+
+// Same as above, but now for the B input matrix. Only compiled when SB == 1.
+#if SB == 1
+inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
+  #pragma unroll
+  for (int ni=0; ni<NWI/VWN; ++ni) {
+    #if STRN == 0
+      int ng = ni + get_local_id(1)*(NWI/VWN);
+    #elif STRN == 1
+      int ng = get_local_id(1) + ni*NDIMC;
+    #endif
+    bpm[ni] = blm[kg*(NWG/VWN) + ng];
+  }
+}
+#endif
+
+// =================================================================================================
+
+// The vectorised multiply-add function: returns 'cvec' updated with 'avec' times the scalar 'bval'
+inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
+  #if USE_VECTOR_MAD == 1
+    cvec += avec * bval;
+  #else
+    #if VWM == 1
+      MultiplyAdd(cvec,    avec,    bval);
+    #elif VWM == 2
+      MultiplyAdd(cvec.x , avec.x,  bval);
+      MultiplyAdd(cvec.y , avec.y,  bval);
+    #elif VWM == 4
+      MultiplyAdd(cvec.x , avec.x,  bval);
+      MultiplyAdd(cvec.y , avec.y,  bval);
+      MultiplyAdd(cvec.z , avec.z,  bval);
+      MultiplyAdd(cvec.w , avec.w,  bval);
+    #elif VWM == 8
+      MultiplyAdd(cvec.s0, avec.s0, bval);
+      MultiplyAdd(cvec.s1, avec.s1, bval);
+      MultiplyAdd(cvec.s2, avec.s2, bval);
+      MultiplyAdd(cvec.s3, avec.s3, bval);
+      MultiplyAdd(cvec.s4, avec.s4, bval);
+      MultiplyAdd(cvec.s5, avec.s5, bval);
+      MultiplyAdd(cvec.s6, avec.s6, bval);
+      MultiplyAdd(cvec.s7, avec.s7, bval);
+    #elif VWM == 16
+      MultiplyAdd(cvec.s0, avec.s0, bval);
+      MultiplyAdd(cvec.s1, avec.s1, bval);
+      MultiplyAdd(cvec.s2, avec.s2, bval);
+      MultiplyAdd(cvec.s3, avec.s3, bval);
+      MultiplyAdd(cvec.s4, avec.s4, bval);
+      MultiplyAdd(cvec.s5, avec.s5, bval);
+      MultiplyAdd(cvec.s6, avec.s6, bval);
+      MultiplyAdd(cvec.s7, avec.s7, bval);
+      MultiplyAdd(cvec.s8, avec.s8, bval);
+      MultiplyAdd(cvec.s9, avec.s9, bval);
+      MultiplyAdd(cvec.sA, avec.sA, bval);
+      MultiplyAdd(cvec.sB, avec.sB, bval);
+      MultiplyAdd(cvec.sC, avec.sC, bval);
+      MultiplyAdd(cvec.sD, avec.sD, bval);
+      MultiplyAdd(cvec.sE, avec.sE, bval);
+      MultiplyAdd(cvec.sF, avec.sF, bval);
+    #endif
+  #endif
+  return cvec;
+}
+
+// Performs the actual computation: Cpm += Apm * Bpm (each scalar lane of Bpm updates one Cpm row)
+inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) {
+  #pragma unroll
+  for (int ni=0; ni<NWI/VWN; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWI/VWM; ++mi) {
+      #if VWN == 1
+        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni]);
+      #elif VWN == 2
+        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
+        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
+      #elif VWN == 4
+        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
+        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
+        cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].z);
+        cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].w);
+      #elif VWN == 8
+        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].s0);
+        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].s1);
+        cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].s2);
+        cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].s3);
+        cpm[ni*VWN + 4][mi] = MultiplyAddVector(cpm[ni*VWN + 4][mi], apm[mi], bpm[ni].s4);
+        cpm[ni*VWN + 5][mi] = MultiplyAddVector(cpm[ni*VWN + 5][mi], apm[mi], bpm[ni].s5);
+        cpm[ni*VWN + 6][mi] = MultiplyAddVector(cpm[ni*VWN + 6][mi], apm[mi], bpm[ni].s6);
+        cpm[ni*VWN + 7][mi] = MultiplyAddVector(cpm[ni*VWN + 7][mi], apm[mi], bpm[ni].s7);
+      #elif VWN == 16
+        cpm[ni*VWN + 0 ][mi] = MultiplyAddVector(cpm[ni*VWN + 0 ][mi], apm[mi], bpm[ni].s0);
+        cpm[ni*VWN + 1 ][mi] = MultiplyAddVector(cpm[ni*VWN + 1 ][mi], apm[mi], bpm[ni].s1);
+        cpm[ni*VWN + 2 ][mi] = MultiplyAddVector(cpm[ni*VWN + 2 ][mi], apm[mi], bpm[ni].s2);
+        cpm[ni*VWN + 3 ][mi] = MultiplyAddVector(cpm[ni*VWN + 3 ][mi], apm[mi], bpm[ni].s3);
+        cpm[ni*VWN + 4 ][mi] = MultiplyAddVector(cpm[ni*VWN + 4 ][mi], apm[mi], bpm[ni].s4);
+        cpm[ni*VWN + 5 ][mi] = MultiplyAddVector(cpm[ni*VWN + 5 ][mi], apm[mi], bpm[ni].s5);
+        cpm[ni*VWN + 6 ][mi] = MultiplyAddVector(cpm[ni*VWN + 6 ][mi], apm[mi], bpm[ni].s6);
+        cpm[ni*VWN + 7 ][mi] = MultiplyAddVector(cpm[ni*VWN + 7 ][mi], apm[mi], bpm[ni].s7);
+        cpm[ni*VWN + 8 ][mi] = MultiplyAddVector(cpm[ni*VWN + 8 ][mi], apm[mi], bpm[ni].s8);
+        cpm[ni*VWN + 9 ][mi] = MultiplyAddVector(cpm[ni*VWN + 9 ][mi], apm[mi], bpm[ni].s9);
+        cpm[ni*VWN + 10][mi] = MultiplyAddVector(cpm[ni*VWN + 10][mi], apm[mi], bpm[ni].sA);
+        cpm[ni*VWN + 11][mi] = MultiplyAddVector(cpm[ni*VWN + 11][mi], apm[mi], bpm[ni].sB);
+        cpm[ni*VWN + 12][mi] = MultiplyAddVector(cpm[ni*VWN + 12][mi], apm[mi], bpm[ni].sC);
+        cpm[ni*VWN + 13][mi] = MultiplyAddVector(cpm[ni*VWN + 13][mi], apm[mi], bpm[ni].sD);
+        cpm[ni*VWN + 14][mi] = MultiplyAddVector(cpm[ni*VWN + 14][mi], apm[mi], bpm[ni].sE);
+        cpm[ni*VWN + 15][mi] = MultiplyAddVector(cpm[ni*VWN + 15][mi], apm[mi], bpm[ni].sF);
+      #endif
+    }
+  }
+}
+
+// =================================================================================================
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
+                         const real alpha, const real beta) {
+  #pragma unroll
+  for (int ni=0; ni<NWI; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWI/VWM; ++mi) {
+      #if STRM == 0
+        int mg = mi + get_local_id(0)*(MWI/VWM);
+      #elif STRM == 1
+        int mg = get_local_id(0) + mi*MDIMC;
+      #endif
+      #if STRN == 0
+        int ng = ni + get_local_id(1)*NWI;
+      #elif STRN == 1
+        int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
+      #endif
+      int idm = mg + get_group_id(0)*(MWG/VWM);
+      int idn = ng + get_group_id(1)*NWG;
+
+      // The final multiplication with alpha and the addition with beta*C
+      int index = idn*(kSizeM/VWM) + idm;
+      realM cval = cgm[index];  // Current value of C, read once before it is overwritten below
+      #if VWM == 1
+        AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
+      #elif VWM == 2
+        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
+        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
+      #elif VWM == 4
+        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
+        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
+        AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
+        AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
+      #elif VWM == 8
+        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
+        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
+        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
+        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
+        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
+        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
+        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
+        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
+      #elif VWM == 16
+        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
+        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
+        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
+        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
+        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
+        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
+        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
+        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
+        AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
+        AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
+        AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
+        AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
+        AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
+        AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
+        AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
+        AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
+      #endif
+    }
+  }
+}
+
+// =================================================================================================
+
+// Main body of the matrix-multiplication: accumulates A*B into 'cpm' ('cgm' is not accessed here)
+inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
+                      const __global realM* restrict agm, const __global realN* restrict bgm,
+                      __global realM* cgm, realM cpm[NWI][MWI/VWM]
+                      #if SA == 1 && SB == 1
+                        , __local realM* alm, __local realN* blm
+                      #elif SA == 1
+                        , __local realM* alm
+                      #elif SB == 1
+                        , __local realN* blm
+                      #endif
+                      ) {
+
+  // Allocates workitem-private memory (registers)
+  realM apm[MWI/VWM];
+  realN bpm[NWI/VWN];
+
+  // Combined thread identifier (volatile to disable caching)
+  #if SA == 1 || SB == 1
+    volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
+  #endif
+
+  // Initializes the accumulation registers
+  InitAccRegisters(cpm);
+
+  // Loops over all workgroup tiles
+  for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
+
+    // Loads data: off-chip --> local (matrix A)
+    #if SA == 1
+      GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
+    #endif
+    // Loads data: off-chip --> local (matrix B)
+    #if SB == 1
+      GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
+    #endif
+    #if SA == 1 || SB == 1
+      barrier(CLK_LOCAL_MEM_FENCE);  // The local tiles must be complete before they are read
+    #endif
+
+    // Loops over all workitem tiles, unrolled by a factor KWI
+    for (int pwi=0; pwi<KWG; pwi+=KWI) {
+      #pragma unroll
+      for (int pit=0; pit<KWI; ++pit) {
+        #if SA == 0 || SB == 0
+          int idk = kwg + pwi + pit;  // K-index into global memory
+        #endif
+        #if SA == 1 || SB == 1
+          int kg = pwi+pit;  // K-index into the local tile
+        #endif
+
+        // Loads data: local --> private (matrix A)
+        #if SA == 1
+          LocalToPrivateA(alm, apm, kg);
+        // Loads data: off-chip --> private (matrix A)
+        #else
+          GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+        #endif
+
+        // Loads data: local --> private (matrix B)
+        #if SB == 1
+          LocalToPrivateB(blm, bpm, kg);
+        // Loads data: off-chip --> private (matrix B)
+        #else
+          GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+        #endif
+
+        // Performs the accumulation (Cpm += Apm * Bpm)
+        MultiplyAccumulate(cpm, apm, bpm);
+      }
+    }
+    #if SA == 1 || SB == 1
+      barrier(CLK_LOCAL_MEM_FENCE);  // All reads must finish before the next tile overwrites them
+    #endif
+  }
+}
+
+// =================================================================================================
+// The upper-triangular and lower-triangular kernels are only used in special cases
+#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
+
+// Main entry point of the kernel. This is the upper-triangular version (square C, size kSizeN).
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
+                         const real alpha, const real beta,
+                         const __global realM* restrict agm,
+                         const __global realN* restrict bgm,
+                         __global realM* cgm) {
+
+  // Skips this workgroup only if its whole tile is below the diagonal (last column < first row)
+  if ((get_group_id(1) + 1)*NWG <= get_group_id(0)*MWG) {
+    return;
+  }
+
+  // Allocates workgroup-private memory (local memory)
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication and stores the result in register memory
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// Main entry point of the kernel. This is the lower-triangular version (square C, size kSizeN).
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void XgemmLower(const int kSizeN, const int kSizeK,
+                         const real alpha, const real beta,
+                         const __global realM* restrict agm,
+                         const __global realN* restrict bgm,
+                         __global realM* cgm) {
+
+  // Skips this workgroup only if its whole tile is above the diagonal (first column > last row)
+  if (get_group_id(1)*NWG >= (get_group_id(0) + 1)*MWG) {
+    return;
+  }
+
+  // Allocates workgroup-private memory (local memory)
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication and stores the result in register memory
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// =================================================================================================
+// If not using a triangular version, include the regular kernel
+#else
+
+// Main entry point of the kernel. This is the regular full version: every workgroup is computed.
+__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+                    const real alpha, const real beta,
+                    const __global realM* restrict agm,
+                    const __global realN* restrict bgm,
+                    __global realM* cgm) {
+
+  // Allocates workgroup-private memory (local memory)
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication and stores the result in register memory
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeM, alpha, beta);
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl
deleted file mode 100644
index 69324f20..00000000
--- a/src/kernels/pad.opencl
+++ /dev/null
@@ -1,349 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains the common kernels shared among different BLAS routines. This file contains
-// kernels to copy and pad matrices in various ways, including:
-// 1) copying into a larger matrix by adding padding
-// 2) copying into a smaller matrix by removing padding
-// 3) from upper/lower triangle into a full matrix
-//
-// =================================================================================================
-
-// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
-// literal). Comment-out this line for syntax-highlighting when developing.
-R"(
-
-// =================================================================================================
-
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef PAD_DIMX
-  #define PAD_DIMX 8      // Local workgroup size in the first dimension (x)
-#endif
-#ifndef PAD_DIMY
-  #define PAD_DIMY 8      // Local workgroup size in the second dimension (y)
-#endif
-#ifndef PAD_WPTX
-  #define PAD_WPTX 1      // Work per thread in the first dimension (x)
-#endif
-#ifndef PAD_WPTY
-  #define PAD_WPTY 1      // Work per thread in the second dimension (y)
-#endif
-
-// =================================================================================================
-
-// Copies a matrix from source to destination. The output is padded with zero values in case the
-// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
-// value and offset can be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void PadMatrix(const int src_one, const int src_two,
-                        const int src_ld, const int src_offset,
-                        __global const real* restrict src,
-                        const int dest_one, const int dest_two,
-                        const int dest_ld, const int dest_offset,
-                        __global real* dest,
-                        const int do_conjugate) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_two && id_one < dest_one) {
-
-        // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the
-        // value to be written to zero.
-        real value;
-        SetToZero(value);
-        if (id_two < src_two && id_one < src_one) {
-          value = src[id_two*src_ld + id_one + src_offset];
-        }
-
-        // Stores the value in the destination matrix
-        if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-        dest[id_two*dest_ld + id_one + dest_offset] = value;
-      }
-    }
-  }
-}
-
-// =================================================================================================
-
-// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
-// writes only the actual data back to the destination matrix. Again, the ld value and offset can
-// be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void UnPadMatrix(const int src_one, const int src_two,
-                          const int src_ld, const int src_offset,
-                          __global const real* restrict src,
-                          const int dest_one, const int dest_two,
-                          const int dest_ld, const int dest_offset,
-                          __global real* dest,
-                          const int upper, const int lower,
-                          const int diagonal_imag_zero) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-
-      // Masking in case of triangular matrices: updates only the upper or lower part
-      bool condition = true;
-      if (upper == 1) { condition = (id_two >= id_one); }
-      else if (lower == 1) { condition = (id_two <= id_one); }
-      if (condition) {
-
-        // Copies the value into the destination matrix. This is always within bounds of the source
-        // matrix, as we know that the destination matrix is smaller than the source.
-        if (id_two < dest_two && id_one < dest_one) {
-          real value = src[id_two*src_ld + id_one + src_offset];
-          if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
-          dest[id_two*dest_ld + id_one + dest_offset] = value;
-        }
-      }
-    }
-  }
-}
-
-// =================================================================================================
-
-// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
-// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the lower-symmetric matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
-          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the upper-symmetric matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
-          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// =================================================================================================
-#if PRECISION == 3232 || PRECISION == 6464
-
-// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
-// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the lower-hermitian matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) {
-            result = src[id_two*src_ld + id_one + src_offset];
-            if (id_one == id_two) { result.y = ZERO; }
-          }
-          else {
-            result = src[id_one*src_ld + id_two + src_offset];
-            COMPLEX_CONJUGATE(result);
-          }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the upper-hermitian matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) {
-            result = src[id_two*src_ld + id_one + src_offset];
-            if (id_one == id_two) { result.y = ZERO; }
-          }
-          else {
-            result = src[id_one*src_ld + id_two + src_offset];
-            COMPLEX_CONJUGATE(result);
-          }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-#endif
-// =================================================================================================
-
-// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
-// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TrmmLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int unit_diagonal) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the lower-triangular matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
-          if (id_two == id_one && unit_diagonal) { SetToOne(result); }
-          // Else: result is zero
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TrmmUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int unit_diagonal) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the upper-triangular matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
-          if (id_one == id_two && unit_diagonal) { SetToOne(result); }
-          // Else: result is zero
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// =================================================================================================
-
-// End of the C++11 raw string literal
-)"
-
-// =================================================================================================
diff --git a/src/kernels/padtranspose.opencl b/src/kernels/padtranspose.opencl
deleted file mode 100644
index a6b70f0b..00000000
--- a/src/kernels/padtranspose.opencl
+++ /dev/null
@@ -1,164 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains the common kernels shared among different BLAS functions. This file contains
-// kernels to transpose matrices in various ways, including:
-// 1) transposing into a larger matrix by adding padding
-// 2) transposing into a smaller matrix by removing padding
-//
-// =================================================================================================
-
-// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
-// literal). Comment-out this line for syntax-highlighting when developing.
-R"(
-
-// =================================================================================================
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef PADTRA_TILE
-  #define PADTRA_TILE 8   // Number of local threads in the two dimensions (x,y)
-#endif
-#ifndef PADTRA_WPT
-  #define PADTRA_WPT 1    // Amount of work per thread
-#endif
-#ifndef PADTRA_PAD
-  #define PADTRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
-#endif
-
-// =================================================================================================
-
-// Same as PadCopyMatrix, but now also does the transpose
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void PadTransposeMatrix(const int src_one, const int src_two,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_one, const int dest_two,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int do_conjugate) {
-
-  // Local memory to store a tile of the matrix (for coalescing)
-  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
-
-  // Loop over the work per thread
-  #pragma unroll
-  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
-    #pragma unroll
-    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
-
-      // Computes the identifiers for the source matrix. Note that the local and global dimensions
-      // do not correspond to each other!
-      const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
-      const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
-
-      // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
-      // Otherwise, set the local memory value to zero.
-      real value;
-      SetToZero(value);
-      if (id_src_two < src_two && id_src_one < src_one) {
-        value = src[id_src_two*src_ld + id_src_one + src_offset];
-      }
-      tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
-    }
-  }
-
-  // Synchronizes all threads in a workgroup
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // Loop over the work per thread
-  #pragma unroll
-  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
-    #pragma unroll
-    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
-
-      // Computes the identifiers for the destination matrix
-      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
-      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
-
-      // Stores the transposed value in the destination matrix
-      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
-        real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
-        if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
-      }
-    }
-  }
-}
-
-// =================================================================================================
-
-// Same as UnPadCopyMatrix, but now also does the transpose
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
-                                   const int src_ld, const int src_offset,
-                                   __global const real* restrict src,
-                                   const int dest_one, const int dest_two,
-                                   const int dest_ld, const int dest_offset,
-                                   __global real* dest,
-                                   const int upper, const int lower,
-                                   const int diagonal_imag_zero) {
-
-  // Local memory to store a tile of the matrix (for coalescing)
-  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
-
-  // Loop over the work per thread
-  #pragma unroll
-  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
-    #pragma unroll
-    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
-
-      // Computes the identifiers for the source matrix. Note that the local and global dimensions
-      // do not correspond to each other!
-      const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
-      const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
-
-      // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
-      if ((id_src_one < src_one) && (id_src_two < src_two)) {
-        real value = src[id_src_two*src_ld + id_src_one + src_offset];
-        tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
-      }
-    }
-  }
-
-  // Synchronizes all threads in a workgroup
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // Loop over the work per thread
-  #pragma unroll
-  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
-    #pragma unroll
-    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
-
-      // Computes the identifiers for the destination matrix
-      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
-      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
-
-      // Masking in case of triangular matrices: updates only the upper or lower part
-      bool condition = true;
-      if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
-      else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
-      if (condition) {
-
-        // Stores the transposed value in the destination matrix
-        if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
-          real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
-          if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
-          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
-        }
-      }
-    }
-  }
-}
-
-// =================================================================================================
-
-// End of the C++11 raw string literal
-)"
-
-// =================================================================================================
diff --git a/src/kernels/transpose.opencl b/src/kernels/transpose.opencl
deleted file mode 100644
index d726f7ec..00000000
--- a/src/kernels/transpose.opencl
+++ /dev/null
@@ -1,149 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains the common kernels shared among different BLAS functions. This file contains
-// kernels to transpose matrices.
-//
-// =================================================================================================
-
-// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
-// literal). Comment-out this line for syntax-highlighting when developing.
-R"(
-
-// =================================================================================================
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef TRA_DIM
-  #define TRA_DIM 8       // Number of local threads in the two dimensions (x,y)
-#endif
-#ifndef TRA_WPT
-  #define TRA_WPT 1       // Work per thread in one dimension and vector-width in the other
-#endif
-#ifndef TRA_PAD
-  #define TRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
-#endif
-#ifndef TRA_SHUFFLE
-  #define TRA_SHUFFLE 0   // Shuffling of the global indices to avoid global memory bank-conflicts
-#endif
-
-// =================================================================================================
-
-// Data-widths
-#if TRA_WPT == 1
-  typedef real realT;
-#elif TRA_WPT == 2
-  typedef real2 realT;
-#elif TRA_WPT == 4
-  typedef real4 realT;
-#elif TRA_WPT == 8
-  typedef real8 realT;
-#elif TRA_WPT == 16
-  typedef real16 realT;
-#endif
-
-// =================================================================================================
-
-// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
-// offset. A more general version is available in 'padtranspose.opencl'.
-__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
-__kernel void TransposeMatrix(const int ld,
-                              __global const realT* restrict src,
-                              __global realT* dest) {
-
-  // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
-  // way over workgroups, breaking memory-bank dependencies.
-  const int gid0 = get_group_id(0);
-  #if TRA_SHUFFLE == 1
-    const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
-  #else
-    const int gid1 = get_group_id(1);
-  #endif
-
-  // Local memory to store a tile of the matrix (for coalescing)
-  __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD];
-
-  // Loops over the work per thread
-  #pragma unroll
-  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
-
-    // Computes the identifiers for the source matrix. Note that the local and global dimensions
-    // do not correspond to each other!
-    const int id_one = gid1 * TRA_DIM + get_local_id(0);
-    const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
-
-    // Loads data into the local memory
-    realT value = src[id_two*(ld/TRA_WPT) + id_one];
-    tile[get_local_id(0)*TRA_WPT + w_one][get_local_id(1)] = value;
-  }
-
-  // Synchronizes all threads in a workgroup
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // Loads transposed data from the local memory
-  realT v[TRA_WPT];
-  #pragma unroll
-  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
-    v[w_one] = tile[get_local_id(1)*TRA_WPT + w_one][get_local_id(0)];
-  }
-
-  // Performs the register-level transpose of the vectorized data
-  realT results[TRA_WPT];
-  #if TRA_WPT == 1
-    results[0] = v[0];
-  #elif TRA_WPT == 2
-    results[0] = (realT) {v[0].x, v[1].x};
-    results[1] = (realT) {v[0].y, v[1].y};
-  #elif TRA_WPT == 4
-    results[0] = (realT) {v[0].x, v[1].x, v[2].x, v[3].x};
-    results[1] = (realT) {v[0].y, v[1].y, v[2].y, v[3].y};
-    results[2] = (realT) {v[0].z, v[1].z, v[2].z, v[3].z};
-    results[3] = (realT) {v[0].w, v[1].w, v[2].w, v[3].w};
-  #elif TRA_WPT == 8
-    results[0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0};
-    results[1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1};
-    results[2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2};
-    results[3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3};
-    results[4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4};
-    results[5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5};
-    results[6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6};
-    results[7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7};
-  #elif TRA_WPT == 16
-    results[ 0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0};
-    results[ 1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1};
-    results[ 2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2};
-    results[ 3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3};
-    results[ 4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4};
-    results[ 5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5};
-    results[ 6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6};
-    results[ 7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7};
-    results[ 8] = (realT) {v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8};
-    results[ 9] = (realT) {v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9};
-    results[10] = (realT) {v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA};
-    results[11] = (realT) {v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB};
-    results[12] = (realT) {v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC};
-    results[13] = (realT) {v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD};
-    results[14] = (realT) {v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE};
-    results[15] = (realT) {v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF};
-  #endif
-
-  // Stores the results into the destination matrix
-  #pragma unroll
-  for (int w_two=0; w_two<TRA_WPT; ++w_two) {
-    const int id_one = gid0*TRA_DIM + get_local_id(0);
-    const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
-    dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
-  }
-}
-
-// =================================================================================================
-
-// End of the C++11 raw string literal
-)"
-
-// =================================================================================================
diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl
deleted file mode 100644
index 8db0f557..00000000
--- a/src/kernels/xgemm.opencl
+++ /dev/null
@@ -1,683 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
-// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
-// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
-// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
-//
-// Matrices are accessed as follows:
-// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
-// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
-// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
-//
-// Or as an image (assuming column-major)
-//       K                      
-//    o-------o                 
-//    |       |                 
-//  N | [B^T] |                 
-//    |       |                 
-//    o-------o                 
-//        K               N     
-//    o-------o        o-----o  
-//  M |  [A]  |      M | [C] |  
-//    |       |        |     |  
-//    o-------o        o-----o  
-//                              
-//
-// =================================================================================================
-
-// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
-// literal). Comment-out this line for syntax-highlighting when developing.
-R"(
-
-// =================================================================================================
-
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef MWG
-  #define MWG 8      // Tile-size in dimension M (e.g. 64, 128)
-#endif
-#ifndef NWG
-  #define NWG 8      // Tile-size in dimension N (e.g. 64, 128)
-#endif
-#ifndef KWG
-  #define KWG 8      // Tile-size in dimension K (e.g. 8, 16)
-#endif
-#ifndef MDIMC
-  #define MDIMC 8    // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
-#endif
-#ifndef NDIMC
-  #define NDIMC 8    // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
-#endif
-#ifndef MDIMA
-  #define MDIMA 8    // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
-#endif
-#ifndef NDIMB
-  #define NDIMB 8    // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
-#endif
-#ifndef KWI
-  #define KWI 1      // Unroll factor of the KWG loop (smaller or equal than KWG)
-#endif
-#ifndef VWM
-  #define VWM 1      // Vector width of matrices A and C 
-#endif
-#ifndef VWN
-  #define VWN 1      // Vector width of matrix B
-#endif
-#ifndef STRM
-  #define STRM 0     // Use strided access within a thread in the M-dimension (1) or not (0)
-#endif
-#ifndef STRN
-  #define STRN 0     // Use strided access within a thread in the N-dimension (1) or not (0)
-#endif
-#ifndef SA
-  #define SA 0       // Use local/shared memory to cache matrix A (1) or not (0)
-#endif
-#ifndef SB
-  #define SB 0       // Use local/shared memory to cache matrix B (1) or not (0)
-#endif
-
-// Helper parameters based on the above tuning parameters
-#define MWI (MWG/MDIMC)               // Work per work-item (M-dimension)
-#define NWI (NWG/NDIMC)               // Work per work-item (N-dimension)
-#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
-#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
-#define MWA (MWG/MDIMA)               // Amount of loads-per-thread for matrix A (M-dimension)
-#define KWA (KWG/KDIMA)               // Amount of loads-per-thread for matrix A (K-dimension)
-#define KWB (KWG/KDIMB)               // Amount of loads-per-thread for matrix B (K-dimension)
-#define NWB (NWG/NDIMB)               // Amount of loads-per-thread for matrix B (N-dimension)
-
-// Settings
-#define USE_VECTOR_MAD 0              // Unroll (0) or don't (1) unroll the vector MAD manually
-
-// =================================================================================================
-
-// Data-widths in dimension M
-#if VWM == 1
-    typedef real realM;
-#elif VWM == 2
-    typedef real2 realM;
-#elif VWM == 4
-    typedef real4 realM;
-#elif VWM == 8
-    typedef real8 realM;
-#elif VWM == 16
-    typedef real16 realM;
-#endif
-
-// Data-widths in dimension N
-#if VWN == 1
-    typedef real realN;
-#elif VWN == 2
-    typedef real2 realN;
-#elif VWN == 4
-    typedef real4 realN;
-#elif VWN == 8
-    typedef real8 realN;
-#elif VWN == 16
-    typedef real16 realN;
-#endif
-
-// =================================================================================================
-
-// Initializes the accumulation registers to zero
-inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
-  #pragma unroll
-  for (int mi=0; mi<MWI/VWM; ++mi) {
-    #pragma unroll
-    for (int ni=0; ni<NWI; ++ni) {
-      #if VWM == 1
-        SetToZero(cpm[ni][mi]);
-      #elif VWM == 2
-        SetToZero(cpm[ni][mi].x);
-        SetToZero(cpm[ni][mi].y);
-      #elif VWM == 4
-        SetToZero(cpm[ni][mi].x);
-        SetToZero(cpm[ni][mi].y);
-        SetToZero(cpm[ni][mi].z);
-        SetToZero(cpm[ni][mi].w);
-      #elif VWM == 8
-        SetToZero(cpm[ni][mi].s0);
-        SetToZero(cpm[ni][mi].s1);
-        SetToZero(cpm[ni][mi].s2);
-        SetToZero(cpm[ni][mi].s3);
-        SetToZero(cpm[ni][mi].s4);
-        SetToZero(cpm[ni][mi].s5);
-        SetToZero(cpm[ni][mi].s6);
-        SetToZero(cpm[ni][mi].s7);
-      #elif VWM == 16
-        SetToZero(cpm[ni][mi].s0);
-        SetToZero(cpm[ni][mi].s1);
-        SetToZero(cpm[ni][mi].s2);
-        SetToZero(cpm[ni][mi].s3);
-        SetToZero(cpm[ni][mi].s4);
-        SetToZero(cpm[ni][mi].s5);
-        SetToZero(cpm[ni][mi].s6);
-        SetToZero(cpm[ni][mi].s7);
-        SetToZero(cpm[ni][mi].s8);
-        SetToZero(cpm[ni][mi].s9);
-        SetToZero(cpm[ni][mi].sA);
-        SetToZero(cpm[ni][mi].sB);
-        SetToZero(cpm[ni][mi].sC);
-        SetToZero(cpm[ni][mi].sD);
-        SetToZero(cpm[ni][mi].sE);
-        SetToZero(cpm[ni][mi].sF);
-      #endif
-    }
-  }
-}
-
-// =================================================================================================
-
-// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
-// caching the A input matrix.
-#if SA == 1
-inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
-                           const int kSizeM, const int tid, const int kwg) {
-  const int la0 = tid % MDIMA;
-  const int la1 = tid / MDIMA;
-  #pragma unroll
-  for (int mia=0; mia<MWA/VWM; ++mia) {
-    #pragma unroll
-    for (int kia=0; kia<KWA; ++kia) {
-
-      // Computes the indices based on strided/non-strided access
-      #if STRM == 0
-        int mg = mia + la0*(MWA/VWM);
-      #elif STRM == 1
-        int mg = la0 + mia*MDIMA;
-      #endif
-
-      // Computes the indices for the global memory
-      int kg = kia + la1*KWA;
-      int idm = mg + get_group_id(0)*(MWG/VWM);
-      int idk = kg + kwg;
-
-      // Loads the data from global memory (not transposed) into the local memory
-      alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
-    }
-  }
-}
-#endif
-
-// Same as above, but now for the B input matrix
-#if SB == 1
-inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
-                           const int kSizeN, const int tid, const int kwg) {
-  const int lb0 = tid % NDIMB;
-  const int lb1 = tid / NDIMB;
-  #pragma unroll
-  for (int kib=0; kib<KWB; ++kib) {
-    #pragma unroll
-    for (int nib=0; nib<NWB/VWN; ++nib) {
-
-      // Computes the indices based on strided/non-strided access
-      #if STRN == 0
-        int ng = nib + lb0*(NWB/VWN);
-      #elif STRN == 1
-        int ng = lb0 + nib*NDIMB;
-      #endif
-
-      // Computes the indices for the global memory
-      int kg = kib + lb1*KWB;
-      int idn = ng + get_group_id(1)*(NWG/VWN);
-      int idk = kg + kwg;
-
-      // Loads the data from global memory (transposed) into the local memory
-      blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
-    }
-  }
-}
-#endif
-
-// =================================================================================================
-
-// Caches global off-chip memory directly into per-thread private memory (registers). This function
-// is specific for caching the A input matrix.
-#if SA == 0
-inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
-                             const int kSizeM, const int idk, const int kwg) {
-  #pragma unroll
-  for (int mi=0; mi<MWI/VWM; ++mi) {
-
-    // Computes the indices based on strided/non-strided access
-    #if STRM == 0
-      int mg = mi + get_local_id(0)*(MWI/VWM);
-    #elif STRM == 1
-      int mg = get_local_id(0) + mi*MDIMC;
-    #endif
-
-    // Computes the indices for the global memory
-    int idm = mg + get_group_id(0)*(MWG/VWM);
-
-    // Loads the data from global memory (not transposed) and stores into registers
-    apm[mi] = agm[idk*(kSizeM/VWM) + idm];
-  }
-}
-#endif
-
-// Same as above, but now for the B input matrix
-#if SB == 0
-inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
-                             const int kSizeN, const int idk) {
-  #pragma unroll
-  for (int ni=0; ni<NWI/VWN; ++ni) {
-
-    // Computes the indices based on strided/non-strided access
-    #if STRN == 0
-      int ng = ni + get_local_id(1)*(NWI/VWN);
-    #elif STRN == 1
-      int ng = get_local_id(1) + ni*NDIMC;
-    #endif
-
-    // Computes the indices for the global memory
-    int idn = ng + get_group_id(1)*(NWG/VWN);
-
-    // Loads the data from global memory (transposed) and stores into registers
-    bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
-  }
-}
-#endif
-
-// =================================================================================================
-
-// Caches on-chip local memory into per-thread private memory (registers). This function is specific
-// for caching the A input matrix.
-#if SA == 1
-inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
-  #pragma unroll
-  for (int mi=0; mi<MWI/VWM; ++mi) {
-    #if STRM == 0
-      int mg = mi + get_local_id(0)*(MWI/VWM);
-    #elif STRM == 1
-      int mg = get_local_id(0) + mi*MDIMC;
-    #endif
-    apm[mi] = alm[kg*(MWG/VWM) + mg];
-  }
-}
-#endif
-
-// Same as above, but now for the B input matrix
-#if SB == 1
-inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
-  #pragma unroll
-  for (int ni=0; ni<NWI/VWN; ++ni) {
-    #if STRN == 0
-      int ng = ni + get_local_id(1)*(NWI/VWN);
-    #elif STRN == 1
-      int ng = get_local_id(1) + ni*NDIMC;
-    #endif
-    bpm[ni] = blm[kg*(NWG/VWN) + ng];
-  }
-}
-#endif
-
-// =================================================================================================
-
-// The vectorised multiply-add function
-inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
-  #if USE_VECTOR_MAD == 1
-    cvec += avec * bval;
-  #else
-    #if VWM == 1
-      MultiplyAdd(cvec,    avec,    bval);
-    #elif VWM == 2
-      MultiplyAdd(cvec.x , avec.x,  bval);
-      MultiplyAdd(cvec.y , avec.y,  bval);
-    #elif VWM == 4
-      MultiplyAdd(cvec.x , avec.x,  bval);
-      MultiplyAdd(cvec.y , avec.y,  bval);
-      MultiplyAdd(cvec.z , avec.z,  bval);
-      MultiplyAdd(cvec.w , avec.w,  bval);
-    #elif VWM == 8
-      MultiplyAdd(cvec.s0, avec.s0, bval);
-      MultiplyAdd(cvec.s1, avec.s1, bval);
-      MultiplyAdd(cvec.s2, avec.s2, bval);
-      MultiplyAdd(cvec.s3, avec.s3, bval);
-      MultiplyAdd(cvec.s4, avec.s4, bval);
-      MultiplyAdd(cvec.s5, avec.s5, bval);
-      MultiplyAdd(cvec.s6, avec.s6, bval);
-      MultiplyAdd(cvec.s7, avec.s7, bval);
-    #elif VWM == 16
-      MultiplyAdd(cvec.s0, avec.s0, bval);
-      MultiplyAdd(cvec.s1, avec.s1, bval);
-      MultiplyAdd(cvec.s2, avec.s2, bval);
-      MultiplyAdd(cvec.s3, avec.s3, bval);
-      MultiplyAdd(cvec.s4, avec.s4, bval);
-      MultiplyAdd(cvec.s5, avec.s5, bval);
-      MultiplyAdd(cvec.s6, avec.s6, bval);
-      MultiplyAdd(cvec.s7, avec.s7, bval);
-      MultiplyAdd(cvec.s8, avec.s8, bval);
-      MultiplyAdd(cvec.s9, avec.s9, bval);
-      MultiplyAdd(cvec.sA, avec.sA, bval);
-      MultiplyAdd(cvec.sB, avec.sB, bval);
-      MultiplyAdd(cvec.sC, avec.sC, bval);
-      MultiplyAdd(cvec.sD, avec.sD, bval);
-      MultiplyAdd(cvec.sE, avec.sE, bval);
-      MultiplyAdd(cvec.sF, avec.sF, bval);
-    #endif
-  #endif
-  return cvec;
-}
-
-// Performs the actual computation: Cpm += Apm * Bpm
-inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) {
-  #pragma unroll
-  for (int ni=0; ni<NWI/VWN; ++ni) {
-    #pragma unroll
-    for (int mi=0; mi<MWI/VWM; ++mi) {
-      #if VWN == 1
-        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni]);
-      #elif VWN == 2
-        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
-        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
-      #elif VWN == 4
-        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
-        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
-        cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].z);
-        cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].w);
-      #elif VWN == 8
-        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].s0);
-        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].s1);
-        cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].s2);
-        cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].s3);
-        cpm[ni*VWN + 4][mi] = MultiplyAddVector(cpm[ni*VWN + 4][mi], apm[mi], bpm[ni].s4);
-        cpm[ni*VWN + 5][mi] = MultiplyAddVector(cpm[ni*VWN + 5][mi], apm[mi], bpm[ni].s5);
-        cpm[ni*VWN + 6][mi] = MultiplyAddVector(cpm[ni*VWN + 6][mi], apm[mi], bpm[ni].s6);
-        cpm[ni*VWN + 7][mi] = MultiplyAddVector(cpm[ni*VWN + 7][mi], apm[mi], bpm[ni].s7);
-      #elif VWN == 16
-        cpm[ni*VWN + 0 ][mi] = MultiplyAddVector(cpm[ni*VWN + 0 ][mi], apm[mi], bpm[ni].s0);
-        cpm[ni*VWN + 1 ][mi] = MultiplyAddVector(cpm[ni*VWN + 1 ][mi], apm[mi], bpm[ni].s1);
-        cpm[ni*VWN + 2 ][mi] = MultiplyAddVector(cpm[ni*VWN + 2 ][mi], apm[mi], bpm[ni].s2);
-        cpm[ni*VWN + 3 ][mi] = MultiplyAddVector(cpm[ni*VWN + 3 ][mi], apm[mi], bpm[ni].s3);
-        cpm[ni*VWN + 4 ][mi] = MultiplyAddVector(cpm[ni*VWN + 4 ][mi], apm[mi], bpm[ni].s4);
-        cpm[ni*VWN + 5 ][mi] = MultiplyAddVector(cpm[ni*VWN + 5 ][mi], apm[mi], bpm[ni].s5);
-        cpm[ni*VWN + 6 ][mi] = MultiplyAddVector(cpm[ni*VWN + 6 ][mi], apm[mi], bpm[ni].s6);
-        cpm[ni*VWN + 7 ][mi] = MultiplyAddVector(cpm[ni*VWN + 7 ][mi], apm[mi], bpm[ni].s7);
-        cpm[ni*VWN + 8 ][mi] = MultiplyAddVector(cpm[ni*VWN + 8 ][mi], apm[mi], bpm[ni].s8);
-        cpm[ni*VWN + 9 ][mi] = MultiplyAddVector(cpm[ni*VWN + 9 ][mi], apm[mi], bpm[ni].s9);
-        cpm[ni*VWN + 10][mi] = MultiplyAddVector(cpm[ni*VWN + 10][mi], apm[mi], bpm[ni].sA);
-        cpm[ni*VWN + 11][mi] = MultiplyAddVector(cpm[ni*VWN + 11][mi], apm[mi], bpm[ni].sB);
-        cpm[ni*VWN + 12][mi] = MultiplyAddVector(cpm[ni*VWN + 12][mi], apm[mi], bpm[ni].sC);
-        cpm[ni*VWN + 13][mi] = MultiplyAddVector(cpm[ni*VWN + 13][mi], apm[mi], bpm[ni].sD);
-        cpm[ni*VWN + 14][mi] = MultiplyAddVector(cpm[ni*VWN + 14][mi], apm[mi], bpm[ni].sE);
-        cpm[ni*VWN + 15][mi] = MultiplyAddVector(cpm[ni*VWN + 15][mi], apm[mi], bpm[ni].sF);
-      #endif
-    }
-  }
-}
-
-// =================================================================================================
-
-// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
-// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
-inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
-                         const real alpha, const real beta) {
-  #pragma unroll
-  for (int ni=0; ni<NWI; ++ni) {
-    #pragma unroll
-    for (int mi=0; mi<MWI/VWM; ++mi) {
-      #if STRM == 0
-        int mg = mi + get_local_id(0)*(MWI/VWM);
-      #elif STRM == 1
-        int mg = get_local_id(0) + mi*MDIMC;
-      #endif
-      #if STRN == 0
-        int ng = ni + get_local_id(1)*NWI;
-      #elif STRN == 1
-        int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
-      #endif
-      int idm = mg + get_group_id(0)*(MWG/VWM);
-      int idn = ng + get_group_id(1)*NWG;
-
-      // The final multiplication with alpha and the addition with beta*C
-      int index = idn*(kSizeM/VWM) + idm;
-      realM cval = cgm[index];
-      #if VWM == 1
-        AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
-      #elif VWM == 2
-        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
-        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
-      #elif VWM == 4
-        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
-        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
-        AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
-        AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
-      #elif VWM == 8
-        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
-        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
-        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
-        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
-        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
-        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
-        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
-        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
-      #elif VWM == 16
-        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
-        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
-        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
-        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
-        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
-        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
-        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
-        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
-        AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
-        AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
-        AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
-        AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
-        AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
-        AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
-        AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
-        AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
-      #endif
-    }
-  }
-}
-
-// =================================================================================================
-
-// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
-inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
-                      const __global realM* restrict agm, const __global realN* restrict bgm,
-                      __global realM* cgm, realM cpm[NWI][MWI/VWM]
-                      #if SA == 1 && SB == 1
-                        , __local realM* alm, __local realN* blm
-                      #elif SA == 1
-                        , __local realM* alm
-                      #elif SB == 1
-                        , __local realN* blm
-                      #endif
-                      ) {
-
-  // Allocates workitem-private memory (registers)
-  realM apm[MWI/VWM];
-  realN bpm[NWI/VWN];
-
-  // Combined thread identifier (volatile to disable caching)
-  #if SA == 1 || SB == 1
-    volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
-  #endif
-
-  // Initializes the accumulation registers
-  InitAccRegisters(cpm);
-
-  // Loops over all workgroup tiles
-  for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
-
-    // Loads data: off-chip --> local (matrix A)
-    #if SA == 1
-      GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
-    #endif
-    // Loads data: off-chip --> local (matrix B)
-    #if SB == 1
-      GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
-    #endif
-    #if SA == 1 || SB == 1
-      barrier(CLK_LOCAL_MEM_FENCE);
-    #endif
-
-    // Loops over all workitem tiles, unrolled by a factor KWI
-    for (int pwi=0; pwi<KWG; pwi+=KWI) {
-      #pragma unroll
-      for (int pit=0; pit<KWI; ++pit) {
-        #if SA == 0 || SB == 0
-          int idk = kwg + pwi + pit;
-        #endif
-        #if SA == 1 || SB == 1
-          int kg = pwi+pit;
-        #endif
-
-        // Loads data: local --> private (matrix A)
-        #if SA == 1
-          LocalToPrivateA(alm, apm, kg);
-        // Loads data: off-chip --> private (matrix A)
-        #else
-          GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
-        #endif
-
-        // Loads data: local --> private (matrix B)
-        #if SB == 1
-          LocalToPrivateB(blm, bpm, kg);
-        // Loads data: off-chip --> private (matrix B)
-        #else
-          GlobalToPrivateB(bgm, bpm, kSizeN, idk);
-        #endif
-
-        // Performs the accumulation (Cpm += Apm * Bpm)
-        MultiplyAccumulate(cpm, apm, bpm);
-      }
-    }
-    #if SA == 1 || SB == 1
-      barrier(CLK_LOCAL_MEM_FENCE);
-    #endif
-  }
-}
-
-// =================================================================================================
-// The upper-triangular and lower-triangular kernels are only used in special cases
-#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
-
-// Main entry point of the kernel. This is the upper-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
-                         const real alpha, const real beta,
-                         const __global realM* restrict agm,
-                         const __global realN* restrict bgm,
-                         __global realM* cgm) {
-
-  // Skip these threads if they do not contain threads contributing to the upper-triangle
-  if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
-    return;
-  }
-
-  // Allocates workgroup-private memory (local memory)
-  #if SA == 1
-    __local realM alm[KWG * MWG/VWM];
-  #endif
-  #if SB == 1
-    __local realN blm[KWG * NWG/VWN];
-  #endif
-
-  // Computes the matrix-multiplication and stores the result in register memory
-  realM cpm[NWI][MWI/VWM];
-  #if SA == 1 && SB == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
-  #elif SA == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
-  #elif SB == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
-  #else
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
-  #endif
-
-  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
-  StoreResults(cgm, cpm, kSizeN, alpha, beta);
-}
-
-// Main entry point of the kernel. This is the lower-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmLower(const int kSizeN, const int kSizeK,
-                         const real alpha, const real beta,
-                         const __global realM* restrict agm,
-                         const __global realN* restrict bgm,
-                         __global realM* cgm) {
-
-  // Skip these threads if they do not contain threads contributing to the lower-triangle
-  if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
-    return;
-  }
-
-  // Allocates workgroup-private memory (local memory)
-  #if SA == 1
-    __local realM alm[KWG * MWG/VWM];
-  #endif
-  #if SB == 1
-    __local realN blm[KWG * NWG/VWN];
-  #endif
-
-  // Computes the matrix-multiplication and stores the result in register memory
-  realM cpm[NWI][MWI/VWM];
-  #if SA == 1 && SB == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
-  #elif SA == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
-  #elif SB == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
-  #else
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
-  #endif
-
-  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
-  StoreResults(cgm, cpm, kSizeN, alpha, beta);
-}
-
-// =================================================================================================
-// If not using a triangular version, include the regular kernel
-#else
-
-// Main entry point of the kernel. This is the regular full version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
-                    const real alpha, const real beta,
-                    const __global realM* restrict agm,
-                    const __global realN* restrict bgm,
-                    __global realM* cgm) {
-
-  // Allocates workgroup-private memory (local memory)
-  #if SA == 1
-    __local realM alm[KWG * MWG/VWM];
-  #endif
-  #if SB == 1
-    __local realN blm[KWG * NWG/VWN];
-  #endif
-
-  // Computes the matrix-multiplication and stores the result in register memory
-  realM cpm[NWI][MWI/VWM];
-  #if SA == 1 && SB == 1
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
-  #elif SA == 1
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
-  #elif SB == 1
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
-  #else
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
-  #endif
-
-  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
-  StoreResults(cgm, cpm, kSizeM, alpha, beta);
-}
-
-#endif
-// =================================================================================================
-
-// End of the C++11 raw string literal
-)"
-
-// =================================================================================================
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 525a82e6..372a407b 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -32,11 +32,11 @@ template <typename T>
 Xgemm<T>::Xgemm(Queue &queue, Event &event):
     Routine<T>(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
   source_string_ =
-    #include "../../kernels/copy.opencl"
-    #include "../../kernels/pad.opencl"
-    #include "../../kernels/transpose.opencl"
-    #include "../../kernels/padtranspose.opencl"
-    #include "../../kernels/xgemm.opencl"
+    #include "../../kernels/level3/copy.opencl"
+    #include "../../kernels/level3/pad.opencl"
+    #include "../../kernels/level3/transpose.opencl"
+    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/xgemm.opencl"
   ;
 }
 
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index 29b2f733..11537d20 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -30,11 +30,11 @@ template <typename T, typename U>
 Xher2k<T,U>::Xher2k(Queue &queue, Event &event):
     Routine<T>(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
   source_string_ =
-    #include "../../kernels/copy.opencl"
-    #include "../../kernels/pad.opencl"
-    #include "../../kernels/transpose.opencl"
-    #include "../../kernels/padtranspose.opencl"
-    #include "../../kernels/xgemm.opencl"
+    #include "../../kernels/level3/copy.opencl"
+    #include "../../kernels/level3/pad.opencl"
+    #include "../../kernels/level3/transpose.opencl"
+    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/xgemm.opencl"
   ;
 }
 
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index 5174e9ab..3c183083 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -30,11 +30,11 @@ template <typename T, typename U>
 Xherk<T,U>::Xherk(Queue &queue, Event &event):
     Routine<T>(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
   source_string_ =
-    #include "../../kernels/copy.opencl"
-    #include "../../kernels/pad.opencl"
-    #include "../../kernels/transpose.opencl"
-    #include "../../kernels/padtranspose.opencl"
-    #include "../../kernels/xgemm.opencl"
+    #include "../../kernels/level3/copy.opencl"
+    #include "../../kernels/level3/pad.opencl"
+    #include "../../kernels/level3/transpose.opencl"
+    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/xgemm.opencl"
   ;
 }
 
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index b36e7c5e..5b003555 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -32,11 +32,11 @@ template <typename T>
 Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event):
     Routine<T>(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
   source_string_ =
-    #include "../../kernels/copy.opencl"
-    #include "../../kernels/pad.opencl"
-    #include "../../kernels/transpose.opencl"
-    #include "../../kernels/padtranspose.opencl"
-    #include "../../kernels/xgemm.opencl"
+    #include "../../kernels/level3/copy.opencl"
+    #include "../../kernels/level3/pad.opencl"
+    #include "../../kernels/level3/transpose.opencl"
+    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/xgemm.opencl"
   ;
 }
 
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index e4668216..6ae824ba 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -32,11 +32,11 @@ template <typename T>
 Xsyrk<T>::Xsyrk(Queue &queue, Event &event):
     Routine<T>(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
   source_string_ =
-    #include "../../kernels/copy.opencl"
-    #include "../../kernels/pad.opencl"
-    #include "../../kernels/transpose.opencl"
-    #include "../../kernels/padtranspose.opencl"
-    #include "../../kernels/xgemm.opencl"
+    #include "../../kernels/level3/copy.opencl"
+    #include "../../kernels/level3/pad.opencl"
+    #include "../../kernels/level3/transpose.opencl"
+    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/xgemm.opencl"
   ;
 }
 
diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc
index 23828b25..e2837e60 100644
--- a/src/tuning/copy.cc
+++ b/src/tuning/copy.cc
@@ -31,7 +31,7 @@ class TuneCopy {
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/copy.opencl"
+      #include "../src/kernels/level3/copy.opencl"
     ;
   }
 
diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc
index 6a826b6b..72729422 100644
--- a/src/tuning/pad.cc
+++ b/src/tuning/pad.cc
@@ -31,7 +31,7 @@ class TunePad {
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/pad.opencl"
+      #include "../src/kernels/level3/pad.opencl"
     ;
   }
 
diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc
index 3f233809..5edd89e0 100644
--- a/src/tuning/padtranspose.cc
+++ b/src/tuning/padtranspose.cc
@@ -31,7 +31,7 @@ class TunePadTranspose {
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/padtranspose.opencl"
+      #include "../src/kernels/level3/padtranspose.opencl"
     ;
   }
 
diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc
index 3998ba66..113e0a81 100644
--- a/src/tuning/transpose.cc
+++ b/src/tuning/transpose.cc
@@ -31,7 +31,7 @@ class TuneTranspose {
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/transpose.opencl"
+      #include "../src/kernels/level3/transpose.opencl"
     ;
   }
 
diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc
index e820cfb0..c06e3e72 100644
--- a/src/tuning/xgemm.cc
+++ b/src/tuning/xgemm.cc
@@ -31,7 +31,7 @@ class TuneXgemm {
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/xgemm.opencl"
+      #include "../src/kernels/level3/xgemm.opencl"
     ;
   }
 
-- 
cgit v1.2.3