From b894611ad196fc9cac40bf5861a23b35c52c52b5 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Tue, 14 Jun 2016 18:17:58 +0200
Subject: Re-organised the level-3 supporting kernels (copy, pad, transpose,
 convert) and renamed files and functions appropriately

---
 src/kernels/level3/convert_hermitian.opencl  | 106 ++++++++
 src/kernels/level3/convert_symmetric.opencl  |  94 +++++++
 src/kernels/level3/convert_triangular.opencl |  98 ++++++++
 src/kernels/level3/copy.opencl               |  73 ------
 src/kernels/level3/copy_fast.opencl          |  56 +++++
 src/kernels/level3/copy_pad.opencl           | 109 +++++++++
 src/kernels/level3/level3.opencl             |  81 ++++++
 src/kernels/level3/pad.opencl                | 353 ---------------------------
 src/kernels/level3/padtranspose.opencl       | 164 -------------
 src/kernels/level3/transpose.opencl          | 149 -----------
 src/kernels/level3/transpose_fast.opencl     | 134 ++++++++++
 src/kernels/level3/transpose_pad.opencl      | 158 ++++++++++++
 src/routine.cc                               |   8 +-
 src/routines/level3/xgemm.cc                 |  12 +-
 src/routines/level3/xher2k.cc                |   9 +-
 src/routines/level3/xherk.cc                 |   9 +-
 src/routines/level3/xsyr2k.cc                |   9 +-
 src/routines/level3/xsyrk.cc                 |   9 +-
 src/routines/level3/xtrmm.cc                 |   2 +-
 src/tuning/copy.cc                           | 119 ---------
 src/tuning/copy_fast.cc                      | 120 +++++++++
 src/tuning/copy_pad.cc                       | 128 ++++++++++
 src/tuning/pad.cc                            | 127 ----------
 src/tuning/padtranspose.cc                   | 131 ----------
 src/tuning/transpose.cc                      | 124 ----------
 src/tuning/transpose_fast.cc                 | 125 ++++++++++
 src/tuning/transpose_pad.cc                  | 132 ++++++++++
 27 files changed, 1374 insertions(+), 1265 deletions(-)
 create mode 100644 src/kernels/level3/convert_hermitian.opencl
 create mode 100644 src/kernels/level3/convert_symmetric.opencl
 create mode 100644 src/kernels/level3/convert_triangular.opencl
 delete mode 100644 src/kernels/level3/copy.opencl
 create mode 100644 src/kernels/level3/copy_fast.opencl
 create mode 100644 src/kernels/level3/copy_pad.opencl
 create mode 100644 src/kernels/level3/level3.opencl
 delete mode 100644 src/kernels/level3/pad.opencl
 delete mode 100644 src/kernels/level3/padtranspose.opencl
 delete mode 100644 src/kernels/level3/transpose.opencl
 create mode 100644 src/kernels/level3/transpose_fast.opencl
 create mode 100644 src/kernels/level3/transpose_pad.opencl
 delete mode 100644 src/tuning/copy.cc
 create mode 100644 src/tuning/copy_fast.cc
 create mode 100644 src/tuning/copy_pad.cc
 delete mode 100644 src/tuning/pad.cc
 delete mode 100644 src/tuning/padtranspose.cc
 delete mode 100644 src/tuning/transpose.cc
 create mode 100644 src/tuning/transpose_fast.cc
 create mode 100644 src/tuning/transpose_pad.cc

(limited to 'src')

diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
new file mode 100644
index 00000000..53cc161a
--- /dev/null
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -0,0 +1,106 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to convert hermitian matrices to/from general matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464)
+
+// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void HermLowerToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-hermitian matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two <= id_one) {
+            result = src[id_two*src_ld + id_one + src_offset];
+            if (id_one == id_two) { result.y = ZERO; }
+          }
+          else {
+            result = src[id_one*src_ld + id_two + src_offset];
+            COMPLEX_CONJUGATE(result);
+          }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Same as above, but now the matrix' data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void HermUpperToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-hermitian matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one <= id_two) {
+            result = src[id_two*src_ld + id_one + src_offset];
+            if (id_one == id_two) { result.y = ZERO; }
+          }
+          else {
+            result = src[id_one*src_ld + id_two + src_offset];
+            COMPLEX_CONJUGATE(result);
+          }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
new file mode 100644
index 00000000..c6ce93ca
--- /dev/null
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -0,0 +1,94 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to convert symmetric matrices to/from general matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_SYMM)
+
+// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmLowerToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-symmetric matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
+          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Same as above, but now the matrix' data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmUpperToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-symmetric matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
+          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
new file mode 100644
index 00000000..fdd2461a
--- /dev/null
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -0,0 +1,98 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to convert triangular matrices to/from general matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_TRMM)
+
+// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void TriaLowerToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest,
+                                 const int unit_diagonal) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-triangular matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
+          if (id_two == id_one && unit_diagonal) { SetToOne(result); }
+          // Else: result is zero
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Same as above, but now the matrix' data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void TriaUpperToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest,
+                                 const int unit_diagonal) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-triangular matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
+          if (id_one == id_two && unit_diagonal) { SetToOne(result); }
+          // Else: result is zero
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/copy.opencl b/src/kernels/level3/copy.opencl
deleted file mode 100644
index 7dde688b..00000000
--- a/src/kernels/level3/copy.opencl
+++ /dev/null
@@ -1,73 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains the common kernels shared among different BLAS routines. This file contains
-// kernels to copy matrices.
-//
-// =================================================================================================
-
-// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
-// literal). Comment-out this line for syntax-highlighting when developing.
-R"(
-
-// =================================================================================================
-
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef COPY_DIMX
-  #define COPY_DIMX 8      // Local workgroup size in the first dimension (x)
-#endif
-#ifndef COPY_DIMY
-  #define COPY_DIMY 8      // Local workgroup size in the second dimension (y)
-#endif
-#ifndef COPY_WPT
-  #define COPY_WPT 1       // Work per thread in the first dimension (x)
-#endif
-#ifndef COPY_VW
-  #define COPY_VW 1        // Vector width in the second dimension (y)
-#endif
-
-// =================================================================================================
-
-// Data-widths
-#if COPY_VW == 1
-  typedef real realC;
-#elif COPY_VW == 2
-  typedef real2 realC;
-#elif COPY_VW == 4
-  typedef real4 realC;
-#elif COPY_VW == 8
-  typedef real8 realC;
-#elif COPY_VW == 16
-  typedef real16 realC;
-#endif
-
-// =================================================================================================
-
-// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
-// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
-__kernel void CopyMatrix(const int ld,
-                         __global const realC* restrict src,
-                         __global realC* dest) {
-  #pragma unroll
-  for (int w_one=0; w_one<COPY_WPT; ++w_one) {
-    const int id_one = get_global_id(0);
-    const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
-    const int id = id_two*(ld/COPY_VW) + id_one;
-    dest[id] = src[id];
-  }
-}
-
-// =================================================================================================
-
-// End of the C++11 raw string literal
-)"
-
-// =================================================================================================
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
new file mode 100644
index 00000000..bfbfacd4
--- /dev/null
+++ b/src/kernels/level3/copy_fast.opencl
@@ -0,0 +1,56 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS routines. This file contains
+// kernels to copy matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Data-widths
+#if COPY_VW == 1
+  typedef real realC;
+#elif COPY_VW == 2
+  typedef real2 realC;
+#elif COPY_VW == 4
+  typedef real4 realC;
+#elif COPY_VW == 8
+  typedef real8 realC;
+#elif COPY_VW == 16
+  typedef real16 realC;
+#endif
+
+// =================================================================================================
+
+// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
+// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
+__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+__kernel void CopyMatrixFast(const int ld,
+                             __global const realC* restrict src,
+                             __global realC* dest) {
+  #pragma unroll
+  for (int w_one=0; w_one<COPY_WPT; ++w_one) {
+    const int id_one = get_global_id(0);
+    const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
+    const int id = id_two*(ld/COPY_VW) + id_one;
+    dest[id] = src[id];
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
new file mode 100644
index 00000000..f211af0f
--- /dev/null
+++ b/src/kernels/level3/copy_pad.opencl
@@ -0,0 +1,109 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS functions. This file contains
+// kernels to copy and pad matrices in various ways, including:
+// 1) copying into a larger matrix by adding padding
+// 2) copying into a smaller matrix by optionally removing padding. This is the general version
+//    without restrictions, see the 'copy.opencl' file for a faster but more restricted copy kernel.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Copies a matrix from source to destination. The output is padded with zero values in case the
+// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
+// value and offset can be different.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void CopyPadMatrix(const int src_one, const int src_two,
+                            const int src_ld, const int src_offset,
+                            __global const real* restrict src,
+                            const int dest_one, const int dest_two,
+                            const int dest_ld, const int dest_offset,
+                            __global real* dest,
+                            const int do_conjugate) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_two && id_one < dest_one) {
+
+        // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the
+        // value to be written to zero.
+        real value;
+        SetToZero(value);
+        if (id_two < src_two && id_one < src_one) {
+          value = src[id_two*src_ld + id_one + src_offset];
+        }
+
+        // Stores the value in the destination matrix
+        if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+        dest[id_two*dest_ld + id_one + dest_offset] = value;
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
+// writes only the actual data back to the destination matrix. Again, the ld value and offset can
+// be different.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void CopyMatrix(const int src_one, const int src_two,
+                         const int src_ld, const int src_offset,
+                         __global const real* restrict src,
+                         const int dest_one, const int dest_two,
+                         const int dest_ld, const int dest_offset,
+                         __global real* dest,
+                         const int upper, const int lower,
+                         const int diagonal_imag_zero) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
+        if (upper == 1) { condition = (id_two >= id_one); }
+        else if (lower == 1) { condition = (id_two <= id_one); }
+      #endif
+      if (condition) {
+
+        // Copies the value into the destination matrix. This is always within bounds of the source
+        // matrix, as we know that the destination matrix is smaller or equal to the source.
+        if (id_two < dest_two && id_one < dest_one) {
+          real value = src[id_two*src_ld + id_one + src_offset];
+          if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
+          dest[id_two*dest_ld + id_one + dest_offset] = value;
+        }
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl
new file mode 100644
index 00000000..bf14ab12
--- /dev/null
+++ b/src/kernels/level3/level3.opencl
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common functions and parameters specific for level 3 BLAS kernels.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+
+// For the 'fast' copy kernel
+#ifndef COPY_DIMX
+  #define COPY_DIMX 8      // Local workgroup size in the first dimension (x)
+#endif
+#ifndef COPY_DIMY
+  #define COPY_DIMY 8      // Local workgroup size in the second dimension (y)
+#endif
+#ifndef COPY_WPT
+  #define COPY_WPT 1       // Work per thread in the first dimension (x)
+#endif
+#ifndef COPY_VW
+  #define COPY_VW 1        // Vector width in the second dimension (y)
+#endif
+
+// For the padding/copy kernels and the conversion kernels
+#ifndef PAD_DIMX
+  #define PAD_DIMX 8      // Local workgroup size in the first dimension (x)
+#endif
+#ifndef PAD_DIMY
+  #define PAD_DIMY 8      // Local workgroup size in the second dimension (y)
+#endif
+#ifndef PAD_WPTX
+  #define PAD_WPTX 1      // Work per thread in the first dimension (x)
+#endif
+#ifndef PAD_WPTY
+  #define PAD_WPTY 1      // Work per thread in the second dimension (y)
+#endif
+
+// For the 'fast' transpose kernel
+#ifndef TRA_DIM
+  #define TRA_DIM 8       // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef TRA_WPT
+  #define TRA_WPT 1       // Work per thread in one dimension and vector-width in the other
+#endif
+#ifndef TRA_PAD
+  #define TRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
+#endif
+#ifndef TRA_SHUFFLE
+  #define TRA_SHUFFLE 0   // Shuffling of the global indices to avoid global memory bank-conflicts
+#endif
+
+// For the padding/transpose kernels
+#ifndef PADTRA_TILE
+  #define PADTRA_TILE 8   // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef PADTRA_WPT
+  #define PADTRA_WPT 1    // Amount of work per thread
+#endif
+#ifndef PADTRA_PAD
+  #define PADTRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
+#endif
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/pad.opencl b/src/kernels/level3/pad.opencl
deleted file mode 100644
index eefddce4..00000000
--- a/src/kernels/level3/pad.opencl
+++ /dev/null
@@ -1,353 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains the common kernels shared among different BLAS routines. This file contains
-// kernels to copy and pad matrices in various ways, including:
-// 1) copying into a larger matrix by adding padding
-// 2) copying into a smaller matrix by removing padding
-// 3) from upper/lower triangle into a full matrix
-//
-// =================================================================================================
-
-// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
-// literal). Comment-out this line for syntax-highlighting when developing.
-R"(
-
-// =================================================================================================
-
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef PAD_DIMX
-  #define PAD_DIMX 8      // Local workgroup size in the first dimension (x)
-#endif
-#ifndef PAD_DIMY
-  #define PAD_DIMY 8      // Local workgroup size in the second dimension (y)
-#endif
-#ifndef PAD_WPTX
-  #define PAD_WPTX 1      // Work per thread in the first dimension (x)
-#endif
-#ifndef PAD_WPTY
-  #define PAD_WPTY 1      // Work per thread in the second dimension (y)
-#endif
-
-// =================================================================================================
-
-// Copies a matrix from source to destination. The output is padded with zero values in case the
-// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
-// value and offset can be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void PadMatrix(const int src_one, const int src_two,
-                        const int src_ld, const int src_offset,
-                        __global const real* restrict src,
-                        const int dest_one, const int dest_two,
-                        const int dest_ld, const int dest_offset,
-                        __global real* dest,
-                        const int do_conjugate) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_two && id_one < dest_one) {
-
-        // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the
-        // value to be written to zero.
-        real value;
-        SetToZero(value);
-        if (id_two < src_two && id_one < src_one) {
-          value = src[id_two*src_ld + id_one + src_offset];
-        }
-
-        // Stores the value in the destination matrix
-        if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-        dest[id_two*dest_ld + id_one + dest_offset] = value;
-      }
-    }
-  }
-}
-
-// =================================================================================================
-
-// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
-// writes only the actual data back to the destination matrix. Again, the ld value and offset can
-// be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void UnPadMatrix(const int src_one, const int src_two,
-                          const int src_ld, const int src_offset,
-                          __global const real* restrict src,
-                          const int dest_one, const int dest_two,
-                          const int dest_ld, const int dest_offset,
-                          __global real* dest,
-                          const int upper, const int lower,
-                          const int diagonal_imag_zero) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-
-      // Masking in case of triangular matrices: updates only the upper or lower part
-      bool condition = true;
-      if (upper == 1) { condition = (id_two >= id_one); }
-      else if (lower == 1) { condition = (id_two <= id_one); }
-      if (condition) {
-
-        // Copies the value into the destination matrix. This is always within bounds of the source
-        // matrix, as we know that the destination matrix is smaller than the source.
-        if (id_two < dest_two && id_one < dest_one) {
-          real value = src[id_two*src_ld + id_one + src_offset];
-          if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
-          dest[id_two*dest_ld + id_one + dest_offset] = value;
-        }
-      }
-    }
-  }
-}
-
-// =================================================================================================
-#if defined(ROUTINE_SYMM)
-
-// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
-// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the lower-symmetric matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
-          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the upper-symmetric matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
-          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-#endif
-// =================================================================================================
-#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464)
-
-// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
-// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the lower-hermitian matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) {
-            result = src[id_two*src_ld + id_one + src_offset];
-            if (id_one == id_two) { result.y = ZERO; }
-          }
-          else {
-            result = src[id_one*src_ld + id_two + src_offset];
-            COMPLEX_CONJUGATE(result);
-          }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the upper-hermitian matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) {
-            result = src[id_two*src_ld + id_one + src_offset];
-            if (id_one == id_two) { result.y = ZERO; }
-          }
-          else {
-            result = src[id_one*src_ld + id_two + src_offset];
-            COMPLEX_CONJUGATE(result);
-          }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-#endif
-// =================================================================================================
-#if defined(ROUTINE_TRMM)
-
-// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
-// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TrmmLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int unit_diagonal) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the lower-triangular matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
-          if (id_two == id_one && unit_diagonal) { SetToOne(result); }
-          // Else: result is zero
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TrmmUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int unit_diagonal) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the upper-triangular matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
-          if (id_one == id_two && unit_diagonal) { SetToOne(result); }
-          // Else: result is zero
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-#endif
-// =================================================================================================
-
-// End of the C++11 raw string literal
-)"
-
-// =================================================================================================
diff --git a/src/kernels/level3/padtranspose.opencl b/src/kernels/level3/padtranspose.opencl
deleted file mode 100644
index a6b70f0b..00000000
--- a/src/kernels/level3/padtranspose.opencl
+++ /dev/null
@@ -1,164 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains the common kernels shared among different BLAS functions. This file contains
-// kernels to transpose matrices in various ways, including:
-// 1) transposing into a larger matrix by adding padding
-// 2) transposing into a smaller matrix by removing padding
-//
-// =================================================================================================
-
-// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
-// literal). Comment-out this line for syntax-highlighting when developing.
-R"(
-
-// =================================================================================================
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef PADTRA_TILE
-  #define PADTRA_TILE 8   // Number of local threads in the two dimensions (x,y)
-#endif
-#ifndef PADTRA_WPT
-  #define PADTRA_WPT 1    // Amount of work per thread
-#endif
-#ifndef PADTRA_PAD
-  #define PADTRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
-#endif
-
-// =================================================================================================
-
-// Same as PadCopyMatrix, but now also does the transpose
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void PadTransposeMatrix(const int src_one, const int src_two,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_one, const int dest_two,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int do_conjugate) {
-
-  // Local memory to store a tile of the matrix (for coalescing)
-  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
-
-  // Loop over the work per thread
-  #pragma unroll
-  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
-    #pragma unroll
-    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
-
-      // Computes the identifiers for the source matrix. Note that the local and global dimensions
-      // do not correspond to each other!
-      const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
-      const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
-
-      // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
-      // Otherwise, set the local memory value to zero.
-      real value;
-      SetToZero(value);
-      if (id_src_two < src_two && id_src_one < src_one) {
-        value = src[id_src_two*src_ld + id_src_one + src_offset];
-      }
-      tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
-    }
-  }
-
-  // Synchronizes all threads in a workgroup
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // Loop over the work per thread
-  #pragma unroll
-  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
-    #pragma unroll
-    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
-
-      // Computes the identifiers for the destination matrix
-      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
-      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
-
-      // Stores the transposed value in the destination matrix
-      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
-        real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
-        if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
-      }
-    }
-  }
-}
-
-// =================================================================================================
-
-// Same as UnPadCopyMatrix, but now also does the transpose
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
-                                   const int src_ld, const int src_offset,
-                                   __global const real* restrict src,
-                                   const int dest_one, const int dest_two,
-                                   const int dest_ld, const int dest_offset,
-                                   __global real* dest,
-                                   const int upper, const int lower,
-                                   const int diagonal_imag_zero) {
-
-  // Local memory to store a tile of the matrix (for coalescing)
-  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
-
-  // Loop over the work per thread
-  #pragma unroll
-  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
-    #pragma unroll
-    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
-
-      // Computes the identifiers for the source matrix. Note that the local and global dimensions
-      // do not correspond to each other!
-      const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
-      const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
-
-      // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
-      if ((id_src_one < src_one) && (id_src_two < src_two)) {
-        real value = src[id_src_two*src_ld + id_src_one + src_offset];
-        tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
-      }
-    }
-  }
-
-  // Synchronizes all threads in a workgroup
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // Loop over the work per thread
-  #pragma unroll
-  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
-    #pragma unroll
-    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
-
-      // Computes the identifiers for the destination matrix
-      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
-      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
-
-      // Masking in case of triangular matrices: updates only the upper or lower part
-      bool condition = true;
-      if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
-      else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
-      if (condition) {
-
-        // Stores the transposed value in the destination matrix
-        if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
-          real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
-          if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
-          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
-        }
-      }
-    }
-  }
-}
-
-// =================================================================================================
-
-// End of the C++11 raw string literal
-)"
-
-// =================================================================================================
diff --git a/src/kernels/level3/transpose.opencl b/src/kernels/level3/transpose.opencl
deleted file mode 100644
index d726f7ec..00000000
--- a/src/kernels/level3/transpose.opencl
+++ /dev/null
@@ -1,149 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains the common kernels shared among different BLAS functions. This file contains
-// kernels to transpose matrices.
-//
-// =================================================================================================
-
-// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
-// literal). Comment-out this line for syntax-highlighting when developing.
-R"(
-
-// =================================================================================================
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef TRA_DIM
-  #define TRA_DIM 8       // Number of local threads in the two dimensions (x,y)
-#endif
-#ifndef TRA_WPT
-  #define TRA_WPT 1       // Work per thread in one dimension and vector-width in the other
-#endif
-#ifndef TRA_PAD
-  #define TRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
-#endif
-#ifndef TRA_SHUFFLE
-  #define TRA_SHUFFLE 0   // Shuffling of the global indices to avoid global memory bank-conflicts
-#endif
-
-// =================================================================================================
-
-// Data-widths
-#if TRA_WPT == 1
-  typedef real realT;
-#elif TRA_WPT == 2
-  typedef real2 realT;
-#elif TRA_WPT == 4
-  typedef real4 realT;
-#elif TRA_WPT == 8
-  typedef real8 realT;
-#elif TRA_WPT == 16
-  typedef real16 realT;
-#endif
-
-// =================================================================================================
-
-// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
-// offset. A more general version is available in 'padtranspose.opencl'.
-__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
-__kernel void TransposeMatrix(const int ld,
-                              __global const realT* restrict src,
-                              __global realT* dest) {
-
-  // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
-  // way over workgroups, breaking memory-bank dependencies.
-  const int gid0 = get_group_id(0);
-  #if TRA_SHUFFLE == 1
-    const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
-  #else
-    const int gid1 = get_group_id(1);
-  #endif
-
-  // Local memory to store a tile of the matrix (for coalescing)
-  __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD];
-
-  // Loops over the work per thread
-  #pragma unroll
-  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
-
-    // Computes the identifiers for the source matrix. Note that the local and global dimensions
-    // do not correspond to each other!
-    const int id_one = gid1 * TRA_DIM + get_local_id(0);
-    const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
-
-    // Loads data into the local memory
-    realT value = src[id_two*(ld/TRA_WPT) + id_one];
-    tile[get_local_id(0)*TRA_WPT + w_one][get_local_id(1)] = value;
-  }
-
-  // Synchronizes all threads in a workgroup
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  // Loads transposed data from the local memory
-  realT v[TRA_WPT];
-  #pragma unroll
-  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
-    v[w_one] = tile[get_local_id(1)*TRA_WPT + w_one][get_local_id(0)];
-  }
-
-  // Performs the register-level transpose of the vectorized data
-  realT results[TRA_WPT];
-  #if TRA_WPT == 1
-    results[0] = v[0];
-  #elif TRA_WPT == 2
-    results[0] = (realT) {v[0].x, v[1].x};
-    results[1] = (realT) {v[0].y, v[1].y};
-  #elif TRA_WPT == 4
-    results[0] = (realT) {v[0].x, v[1].x, v[2].x, v[3].x};
-    results[1] = (realT) {v[0].y, v[1].y, v[2].y, v[3].y};
-    results[2] = (realT) {v[0].z, v[1].z, v[2].z, v[3].z};
-    results[3] = (realT) {v[0].w, v[1].w, v[2].w, v[3].w};
-  #elif TRA_WPT == 8
-    results[0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0};
-    results[1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1};
-    results[2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2};
-    results[3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3};
-    results[4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4};
-    results[5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5};
-    results[6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6};
-    results[7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7};
-  #elif TRA_WPT == 16
-    results[ 0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0};
-    results[ 1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1};
-    results[ 2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2};
-    results[ 3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3};
-    results[ 4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4};
-    results[ 5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5};
-    results[ 6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6};
-    results[ 7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7};
-    results[ 8] = (realT) {v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8};
-    results[ 9] = (realT) {v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9};
-    results[10] = (realT) {v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA};
-    results[11] = (realT) {v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB};
-    results[12] = (realT) {v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC};
-    results[13] = (realT) {v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD};
-    results[14] = (realT) {v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE};
-    results[15] = (realT) {v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF};
-  #endif
-
-  // Stores the results into the destination matrix
-  #pragma unroll
-  for (int w_two=0; w_two<TRA_WPT; ++w_two) {
-    const int id_one = gid0*TRA_DIM + get_local_id(0);
-    const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
-    dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
-  }
-}
-
-// =================================================================================================
-
-// End of the C++11 raw string literal
-)"
-
-// =================================================================================================
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
new file mode 100644
index 00000000..08266461
--- /dev/null
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -0,0 +1,134 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS functions. This file contains
+// a kernel to transpose matrices. This is a 'fast' version with restrictions, see the
+// 'padtranspose.opencl' file for a general transpose kernel.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Data-widths
+#if TRA_WPT == 1
+  typedef real realT;
+#elif TRA_WPT == 2
+  typedef real2 realT;
+#elif TRA_WPT == 4
+  typedef real4 realT;
+#elif TRA_WPT == 8
+  typedef real8 realT;
+#elif TRA_WPT == 16
+  typedef real16 realT;
+#endif
+
+// =================================================================================================
+
+// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
+// offset. A more general version is available in 'padtranspose.opencl'.
+__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+__kernel void TransposeMatrixFast(const int ld,
+                                  __global const realT* restrict src,
+                                  __global realT* dest) {
+
+  // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
+  // way over workgroups, breaking memory-bank dependencies.
+  const int gid0 = get_group_id(0);
+  #if TRA_SHUFFLE == 1
+    const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
+  #else
+    const int gid1 = get_group_id(1);
+  #endif
+
+  // Local memory to store a tile of the matrix (for coalescing)
+  __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD];
+
+  // Loops over the work per thread
+  #pragma unroll
+  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
+
+    // Computes the identifiers for the source matrix. Note that the local and global dimensions
+    // do not correspond to each other!
+    const int id_one = gid1 * TRA_DIM + get_local_id(0);
+    const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
+
+    // Loads data into the local memory
+    realT value = src[id_two*(ld/TRA_WPT) + id_one];
+    tile[get_local_id(0)*TRA_WPT + w_one][get_local_id(1)] = value;
+  }
+
+  // Synchronizes all threads in a workgroup
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Loads transposed data from the local memory
+  realT v[TRA_WPT];
+  #pragma unroll
+  for (int w_one=0; w_one<TRA_WPT; ++w_one) {
+    v[w_one] = tile[get_local_id(1)*TRA_WPT + w_one][get_local_id(0)];
+  }
+
+  // Performs the register-level transpose of the vectorized data
+  realT results[TRA_WPT];
+  #if TRA_WPT == 1
+    results[0] = v[0];
+  #elif TRA_WPT == 2
+    results[0] = (realT) {v[0].x, v[1].x};
+    results[1] = (realT) {v[0].y, v[1].y};
+  #elif TRA_WPT == 4
+    results[0] = (realT) {v[0].x, v[1].x, v[2].x, v[3].x};
+    results[1] = (realT) {v[0].y, v[1].y, v[2].y, v[3].y};
+    results[2] = (realT) {v[0].z, v[1].z, v[2].z, v[3].z};
+    results[3] = (realT) {v[0].w, v[1].w, v[2].w, v[3].w};
+  #elif TRA_WPT == 8
+    results[0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0};
+    results[1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1};
+    results[2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2};
+    results[3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3};
+    results[4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4};
+    results[5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5};
+    results[6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6};
+    results[7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7};
+  #elif TRA_WPT == 16
+    results[ 0] = (realT) {v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0};
+    results[ 1] = (realT) {v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1};
+    results[ 2] = (realT) {v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2};
+    results[ 3] = (realT) {v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3};
+    results[ 4] = (realT) {v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4};
+    results[ 5] = (realT) {v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5};
+    results[ 6] = (realT) {v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6};
+    results[ 7] = (realT) {v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7};
+    results[ 8] = (realT) {v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8};
+    results[ 9] = (realT) {v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9};
+    results[10] = (realT) {v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA};
+    results[11] = (realT) {v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB};
+    results[12] = (realT) {v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC};
+    results[13] = (realT) {v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD};
+    results[14] = (realT) {v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE};
+    results[15] = (realT) {v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF};
+  #endif
+
+  // Stores the results into the destination matrix
+  #pragma unroll
+  for (int w_two=0; w_two<TRA_WPT; ++w_two) {
+    const int id_one = gid0*TRA_DIM + get_local_id(0);
+    const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
+    dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
new file mode 100644
index 00000000..38c23346
--- /dev/null
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -0,0 +1,158 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS functions. This file contains
+// kernels to transpose matrices in various ways, including:
+// 1) transposing into a larger matrix by adding padding
+// 2) transposing into a smaller matrix by optionally removing padding. This is the general version
+//    without restrictions, see the 'transpose.opencl' file for a faster but more restricted
+//    transpose kernel.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Transposes a matrix from source to destination. The output is padded with zero values in case the
+// destination matrix dimensions are larger than the transposed source matrix dimensions.
+__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+__kernel void TransposePadMatrix(const int src_one, const int src_two,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_one, const int dest_two,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest,
+                                 const int do_conjugate) {
+
+  // Local memory to store a tile of the matrix (for coalescing)
+  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
+
+  // Loop over the work per thread
+  #pragma unroll
+  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+    #pragma unroll
+    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+      // Computes the identifiers for the source matrix. Note that the local and global dimensions
+      // do not correspond to each other!
+      const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
+      const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
+
+      // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
+      // Otherwise, set the local memory value to zero.
+      real value;
+      SetToZero(value);
+      if (id_src_two < src_two && id_src_one < src_one) {
+        value = src[id_src_two*src_ld + id_src_one + src_offset];
+      }
+      tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
+    }
+  }
+
+  // Synchronizes all threads in a workgroup
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Loop over the work per thread
+  #pragma unroll
+  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+    #pragma unroll
+    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+      // Computes the identifiers for the destination matrix
+      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
+      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
+
+      // Stores the transposed value in the destination matrix
+      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+        real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+        if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
+// padded source matrix, but only the actual data is written back to the transposed destination
+// matrix. This kernel optionally checks for upper/lower triangular matrices.
+__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+__kernel void TransposeMatrix(const int src_one, const int src_two,
+                              const int src_ld, const int src_offset,
+                              __global const real* restrict src,
+                              const int dest_one, const int dest_two,
+                              const int dest_ld, const int dest_offset,
+                              __global real* dest,
+                              const int upper, const int lower,
+                              const int diagonal_imag_zero) {
+
+  // Local memory to store a tile of the matrix (for coalescing)
+  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
+
+  // Loop over the work per thread
+  #pragma unroll
+  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+    #pragma unroll
+    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+      // Computes the identifiers for the source matrix. Note that the local and global dimensions
+      // do not correspond to each other!
+      const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
+      const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);
+
+      // Loads data into the local memory if the thread IDs are within bounds of the source matrix.
+      if ((id_src_one < src_one) && (id_src_two < src_two)) {
+        real value = src[id_src_two*src_ld + id_src_one + src_offset];
+        tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
+      }
+    }
+  }
+
+  // Synchronizes all threads in a workgroup
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Loop over the work per thread
+  #pragma unroll
+  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
+    #pragma unroll
+    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {
+
+      // Computes the identifiers for the destination matrix
+      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
+      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
+
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
+        if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
+        else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
+      #endif
+      if (condition) {
+
+        // Stores the transposed value in the destination matrix
+        if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+          real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+          if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
+          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+        }
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routine.cc b/src/routine.cc
index ca283b52..dee1f090 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -319,11 +319,11 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
         IsMultiple(src_ld, db_["TRA_WPT"]) &&
         IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) &&
         IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"])) {
-      kernel_name = "TransposeMatrix";
+      kernel_name = "TransposeMatrixFast";
     }
     else {
       use_fast_kernel = false;
-      kernel_name = (do_pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
+      kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
     }
   }
   else {
@@ -331,11 +331,11 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
         IsMultiple(src_ld, db_["COPY_VW"]) &&
         IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) &&
         IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"])) {
-      kernel_name = "CopyMatrix";
+      kernel_name = "CopyMatrixFast";
     }
     else {
       use_fast_kernel = false;
-      kernel_name = (do_pad) ? "PadMatrix" : "UnPadMatrix";
+      kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
     }
   }
 
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index ab36076c..d08b6038 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -33,10 +33,14 @@ template <typename T>
 Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
     Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
   source_string_ =
-    #include "../../kernels/level3/copy.opencl"
-    #include "../../kernels/level3/pad.opencl"
-    #include "../../kernels/level3/transpose.opencl"
-    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
+    #include "../../kernels/level3/convert_symmetric.opencl"
+    #include "../../kernels/level3/convert_triangular.opencl"
+    #include "../../kernels/level3/convert_hermitian.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
   ;
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index 1acba517..e83d105f 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -30,10 +30,11 @@ template <typename T, typename U>
 Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
     Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
   source_string_ =
-    #include "../../kernels/level3/copy.opencl"
-    #include "../../kernels/level3/pad.opencl"
-    #include "../../kernels/level3/transpose.opencl"
-    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
   ;
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index ea1aa614..9ab50dd2 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -30,10 +30,11 @@ template <typename T, typename U>
 Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
     Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
   source_string_ =
-    #include "../../kernels/level3/copy.opencl"
-    #include "../../kernels/level3/pad.opencl"
-    #include "../../kernels/level3/transpose.opencl"
-    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
   ;
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index 4f86bac5..49fbe64b 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -33,10 +33,11 @@ template <typename T>
 Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
     Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
   source_string_ =
-    #include "../../kernels/level3/copy.opencl"
-    #include "../../kernels/level3/pad.opencl"
-    #include "../../kernels/level3/transpose.opencl"
-    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
   ;
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index 52cb58c0..9913c7ca 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -33,10 +33,11 @@ template <typename T>
 Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
     Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
   source_string_ =
-    #include "../../kernels/level3/copy.opencl"
-    #include "../../kernels/level3/pad.opencl"
-    #include "../../kernels/level3/transpose.opencl"
-    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
   ;
diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc
index 18cbb1c0..27ecb4ed 100644
--- a/src/routines/level3/xtrmm.cc
+++ b/src/routines/level3/xtrmm.cc
@@ -51,7 +51,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
   // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
   bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                    (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";
+  auto kernel_name = (is_upper) ? "TriaUpperToSquared" : "TriaLowerToSquared";
 
   // Determines whether or not the triangular matrix is unit-diagonal
   auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc
deleted file mode 100644
index 09cdecf1..00000000
--- a/src/tuning/copy.cc
+++ /dev/null
@@ -1,119 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels.
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-
-#include "internal/utilities.h"
-#include "internal/tuning.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TuneCopy {
- public:
-
-  // The representative kernel and the source code
-  static std::string KernelFamily() { return "copy"; }
-  static std::string KernelName() { return "CopyMatrix"; }
-  static std::string GetSources() {
-    return
-      #include "../src/kernels/common.opencl"
-      #include "../src/kernels/level3/copy.opencl"
-    ;
-  }
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
-
-  // Tests for valid arguments
-  static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the default values for the arguments
-  static size_t DefaultM() { return 1024; }
-  static size_t DefaultN() { return 1024; }
-  static size_t DefaultK() { return 1; } // N/A for this kernel
-  static double DefaultFraction() { return 1.0; } // N/A for this kernel
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
-  static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
-  static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
-
-  // Sets the tuning parameters and their possible values
-  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
-    tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32});
-    tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32});
-    tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8});
-    tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8});
-  }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
-
-  // Sets the base thread configuration
-  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
-  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
-  static std::vector<size_t> LocalSize() { return {1, 1}; }
-  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
-
-  // Transforms the thread configuration based on the parameters
-  using TransformVector = std::vector<std::vector<std::string>>;
-  static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; }
-  static TransformVector DivLocal() { return {}; }
-  static TransformVector MulGlobal() { return {}; }
-  static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; }
-
-  // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentOutput(b_mat);
-  }
-
-  // Describes how to compute the performance metrics
-  static size_t GetMetric(const Arguments<T> &args) {
-    return 2 * args.m * args.n * GetBytes(args.precision);
-  }
-  static std::string PerformanceUnit() { return "GB/s"; }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneCopy<half>, half>(argc, argv); break;
-    case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneCopy<float>, float>(argc, argv); break;
-    case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneCopy<double>, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneCopy<float2>, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneCopy<double2>, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/src/tuning/copy_fast.cc b/src/tuning/copy_fast.cc
new file mode 100644
index 00000000..2da707be
--- /dev/null
+++ b/src/tuning/copy_fast.cc
@@ -0,0 +1,120 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TuneCopy {
+ public:
+
+  // The representative kernel and the source code
+  static std::string KernelFamily() { return "copy"; }
+  static std::string KernelName() { return "CopyMatrixFast"; }
+  static std::string GetSources() {
+    return
+      #include "../src/kernels/common.opencl"
+      #include "../src/kernels/level3/level3.opencl"
+      #include "../src/kernels/level3/copy_fast.opencl"
+    ;
+  }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+
+  // Tests for valid arguments
+  static void TestValidArguments(const Arguments<T> &) { }
+
+  // Sets the default values for the arguments
+  static size_t DefaultM() { return 1024; }
+  static size_t DefaultN() { return 1024; }
+  static size_t DefaultK() { return 1; } // N/A for this kernel
+  static double DefaultFraction() { return 1.0; } // N/A for this kernel
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
+
+  // Sets the tuning parameters and their possible values
+  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
+    tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32});
+    tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32});
+    tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8});
+    tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8});
+  }
+
+  // Sets the constraints and local memory size
+  static void SetConstraints(cltune::Tuner &, const size_t) { }
+  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+
+  // Sets the base thread configuration
+  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
+  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
+  static std::vector<size_t> LocalSize() { return {1, 1}; }
+  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
+
+  // Transforms the thread configuration based on the parameters
+  using TransformVector = std::vector<std::vector<std::string>>;
+  static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; }
+  static TransformVector DivLocal() { return {}; }
+  static TransformVector MulGlobal() { return {}; }
+  static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; }
+
+  // Sets the kernel's arguments
+  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
+                           std::vector<T> &, std::vector<T> &,
+                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
+                           std::vector<T> &) {
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentInput(a_mat);
+    tuner.AddArgumentOutput(b_mat);
+  }
+
+  // Describes how to compute the performance metrics
+  static size_t GetMetric(const Arguments<T> &args) {
+    return 2 * args.m * args.n * GetBytes(args.precision);
+  }
+  static std::string PerformanceUnit() { return "GB/s"; }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneCopy<half>, half>(argc, argv); break;
+    case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneCopy<float>, float>(argc, argv); break;
+    case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneCopy<double>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneCopy<float2>, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneCopy<double2>, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/copy_pad.cc b/src/tuning/copy_pad.cc
new file mode 100644
index 00000000..ec392471
--- /dev/null
+++ b/src/tuning/copy_pad.cc
@@ -0,0 +1,128 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TunePad {
+ public:
+
+  // The representative kernel and the source code
+  static std::string KernelFamily() { return "pad"; }
+  static std::string KernelName() { return "CopyPadMatrix"; }
+  static std::string GetSources() {
+    return
+      #include "../src/kernels/common.opencl"
+      #include "../src/kernels/level3/level3.opencl"
+      #include "../src/kernels/level3/copy_pad.opencl"
+    ;
+  }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+
+  // Tests for valid arguments
+  static void TestValidArguments(const Arguments<T> &) { }
+
+  // Sets the default values for the arguments
+  static size_t DefaultM() { return 1024; }
+  static size_t DefaultN() { return 1024; }
+  static size_t DefaultK() { return 1; } // N/A for this kernel
+  static double DefaultFraction() { return 1.0; } // N/A for this kernel
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
+
+  // Sets the tuning parameters and their possible values
+  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
+    tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32});
+    tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32});
+    tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4});
+    tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4});
+  }
+
+  // Sets the constraints and local memory size
+  static void SetConstraints(cltune::Tuner &, const size_t) { }
+  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
+
+  // Sets the base thread configuration
+  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
+  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
+  static std::vector<size_t> LocalSize() { return {1, 1}; }
+  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
+
+  // Transforms the thread configuration based on the parameters
+  using TransformVector = std::vector<std::vector<std::string>>;
+  static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; }
+  static TransformVector DivLocal() { return {}; }
+  static TransformVector MulGlobal() { return {}; }
+  static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; }
+
+  // Sets the kernel's arguments
+  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
+                           std::vector<T> &, std::vector<T> &,
+                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
+                           std::vector<T> &) {
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(static_cast<int>(args.n));
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(0);
+    tuner.AddArgumentInput(a_mat);
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(static_cast<int>(args.n));
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(0);
+    tuner.AddArgumentOutput(b_mat);
+    tuner.AddArgumentScalar(0);
+  }
+
+  // Describes how to compute the performance metrics
+  static size_t GetMetric(const Arguments<T> &args) {
+    return 2 * args.m * args.n * GetBytes(args.precision);
+  }
+  static std::string PerformanceUnit() { return "GB/s"; }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePad<half>, half>(argc, argv); break;
+    case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePad<float>, float>(argc, argv); break;
+    case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePad<double>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePad<float2>, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TunePad<double2>, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc
deleted file mode 100644
index 075688db..00000000
--- a/src/tuning/pad.cc
+++ /dev/null
@@ -1,127 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels.
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-
-#include "internal/utilities.h"
-#include "internal/tuning.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TunePad {
- public:
-
-  // The representative kernel and the source code
-  static std::string KernelFamily() { return "pad"; }
-  static std::string KernelName() { return "PadMatrix"; }
-  static std::string GetSources() {
-    return
-      #include "../src/kernels/common.opencl"
-      #include "../src/kernels/level3/pad.opencl"
-    ;
-  }
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
-
-  // Tests for valid arguments
-  static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the default values for the arguments
-  static size_t DefaultM() { return 1024; }
-  static size_t DefaultN() { return 1024; }
-  static size_t DefaultK() { return 1; } // N/A for this kernel
-  static double DefaultFraction() { return 1.0; } // N/A for this kernel
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
-  static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
-  static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
-
-  // Sets the tuning parameters and their possible values
-  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
-    tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32});
-    tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32});
-    tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4});
-    tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4});
-  }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
-
-  // Sets the base thread configuration
-  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
-  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
-  static std::vector<size_t> LocalSize() { return {1, 1}; }
-  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
-
-  // Transforms the thread configuration based on the parameters
-  using TransformVector = std::vector<std::vector<std::string>>;
-  static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; }
-  static TransformVector DivLocal() { return {}; }
-  static TransformVector MulGlobal() { return {}; }
-  static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; }
-
-  // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(0);
-  }
-
-  // Describes how to compute the performance metrics
-  static size_t GetMetric(const Arguments<T> &args) {
-    return 2 * args.m * args.n * GetBytes(args.precision);
-  }
-  static std::string PerformanceUnit() { return "GB/s"; }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePad<half>, half>(argc, argv); break;
-    case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePad<float>, float>(argc, argv); break;
-    case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePad<double>, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePad<float2>, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TunePad<double2>, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc
deleted file mode 100644
index a970f982..00000000
--- a/src/tuning/padtranspose.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels.
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-
-#include "internal/utilities.h"
-#include "internal/tuning.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TunePadTranspose {
- public:
-
-  // The representative kernel and the source code
-  static std::string KernelFamily() { return "padtranspose"; }
-  static std::string KernelName() { return "PadTransposeMatrix"; }
-  static std::string GetSources() {
-    return
-      #include "../src/kernels/common.opencl"
-      #include "../src/kernels/level3/padtranspose.opencl"
-    ;
-  }
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
-
-  // Tests for valid arguments
-  static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the default values for the arguments
-  static size_t DefaultM() { return 1024; }
-  static size_t DefaultN() { return 1024; }
-  static size_t DefaultK() { return 1; } // N/A for this kernel
-  static double DefaultFraction() { return 1.0; } // N/A for this kernel
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
-  static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
-  static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
-
-  // Sets the tuning parameters and their possible values
-  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
-    tuner.AddParameter(id, "PADTRA_TILE", {8, 16, 32, 64});
-    tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16});
-    tuner.AddParameter(id, "PADTRA_PAD", {0, 1});
-  }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"});
-  }
-
-  // Sets the base thread configuration
-  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
-  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
-  static std::vector<size_t> LocalSize() { return {1, 1}; }
-  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
-
-  // Transforms the thread configuration based on the parameters
-  using TransformVector = std::vector<std::vector<std::string>>;
-  static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; }
-  static TransformVector DivLocal() { return {}; }
-  static TransformVector MulGlobal() { return {}; }
-  static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; }
-
-  // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(0);
-    tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentScalar(0);
-  }
-
-  // Describes how to compute the performance metrics
-  static size_t GetMetric(const Arguments<T> &args) {
-    return 2 * args.m * args.n * GetBytes(args.precision);
-  }
-  static std::string PerformanceUnit() { return "GB/s"; }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePadTranspose<half>, half>(argc, argv); break;
-    case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePadTranspose<float>, float>(argc, argv); break;
-    case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePadTranspose<double>, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePadTranspose<float2>, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TunePadTranspose<double2>, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc
deleted file mode 100644
index d217a3df..00000000
--- a/src/tuning/transpose.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels.
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-
-#include "internal/utilities.h"
-#include "internal/tuning.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TuneTranspose {
- public:
-
-  // The representative kernel and the source code
-  static std::string KernelFamily() { return "transpose"; }
-  static std::string KernelName() { return "TransposeMatrix"; }
-  static std::string GetSources() {
-    return
-      #include "../src/kernels/common.opencl"
-      #include "../src/kernels/level3/transpose.opencl"
-    ;
-  }
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
-
-  // Tests for valid arguments
-  static void TestValidArguments(const Arguments<T> &) { }
-
-  // Sets the default values for the arguments
-  static size_t DefaultM() { return 1024; }
-  static size_t DefaultN() { return 1024; }
-  static size_t DefaultK() { return 1; } // N/A for this kernel
-  static double DefaultFraction() { return 1.0; } // N/A for this kernel
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
-  static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
-  static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
-
-  // Sets the tuning parameters and their possible values
-  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
-    tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
-    tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
-    tuner.AddParameter(id, "TRA_PAD", {0, 1});
-    tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1});
-  }
-
-  // Sets the constraints and local memory size
-  static void SetConstraints(cltune::Tuner &, const size_t) { }
-  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
-    };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"});
-  }
-
-  // Sets the base thread configuration
-  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
-  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
-  static std::vector<size_t> LocalSize() { return {1, 1}; }
-  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
-
-  // Transforms the thread configuration based on the parameters
-  using TransformVector = std::vector<std::vector<std::string>>;
-  static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; }
-  static TransformVector DivLocal() { return {}; }
-  static TransformVector MulGlobal() { return {}; }
-  static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; }
-
-  // Sets the kernel's arguments
-  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
-                           std::vector<T> &, std::vector<T> &,
-                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
-                           std::vector<T> &) {
-    tuner.AddArgumentScalar(static_cast<int>(args.m));
-    tuner.AddArgumentInput(a_mat);
-    tuner.AddArgumentOutput(b_mat);
-  }
-
-  // Describes how to compute the performance metrics
-  static size_t GetMetric(const Arguments<T> &args) {
-    return 2 * args.m * args.n * GetBytes(args.precision);
-  }
-  static std::string PerformanceUnit() { return "GB/s"; }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneTranspose<half>, half>(argc, argv); break;
-    case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneTranspose<float>, float>(argc, argv); break;
-    case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneTranspose<double>, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneTranspose<float2>, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneTranspose<double2>, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/src/tuning/transpose_fast.cc b/src/tuning/transpose_fast.cc
new file mode 100644
index 00000000..1a5260c1
--- /dev/null
+++ b/src/tuning/transpose_fast.cc
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TuneTranspose {
+ public:
+
+  // The representative kernel and the source code
+  static std::string KernelFamily() { return "transpose"; }
+  static std::string KernelName() { return "TransposeMatrixFast"; }
+  static std::string GetSources() {
+    return
+      #include "../src/kernels/common.opencl"
+      #include "../src/kernels/level3/level3.opencl"
+      #include "../src/kernels/level3/transpose_fast.opencl"
+    ;
+  }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+
+  // Tests for valid arguments
+  static void TestValidArguments(const Arguments<T> &) { }
+
+  // Sets the default values for the arguments
+  static size_t DefaultM() { return 1024; }
+  static size_t DefaultN() { return 1024; }
+  static size_t DefaultK() { return 1; } // N/A for this kernel
+  static double DefaultFraction() { return 1.0; } // N/A for this kernel
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
+
+  // Sets the tuning parameters and their possible values
+  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
+    tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
+    tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
+    tuner.AddParameter(id, "TRA_PAD", {0, 1});
+    tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1});
+  }
+
+  // Sets the constraints and local memory size
+  static void SetConstraints(cltune::Tuner &, const size_t) { }
+  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
+    auto LocalMemorySize = [args] (std::vector<size_t> v) {
+      return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
+    };
+    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"});
+  }
+
+  // Sets the base thread configuration
+  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
+  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
+  static std::vector<size_t> LocalSize() { return {1, 1}; }
+  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
+
+  // Transforms the thread configuration based on the parameters
+  using TransformVector = std::vector<std::vector<std::string>>;
+  static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; }
+  static TransformVector DivLocal() { return {}; }
+  static TransformVector MulGlobal() { return {}; }
+  static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; }
+
+  // Sets the kernel's arguments
+  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
+                           std::vector<T> &, std::vector<T> &,
+                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
+                           std::vector<T> &) {
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentInput(a_mat);
+    tuner.AddArgumentOutput(b_mat);
+  }
+
+  // Describes how to compute the performance metrics
+  static size_t GetMetric(const Arguments<T> &args) {
+    return 2 * args.m * args.n * GetBytes(args.precision);
+  }
+  static std::string PerformanceUnit() { return "GB/s"; }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneTranspose<half>, half>(argc, argv); break;
+    case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneTranspose<float>, float>(argc, argv); break;
+    case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneTranspose<double>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneTranspose<float2>, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneTranspose<double2>, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/transpose_pad.cc b/src/tuning/transpose_pad.cc
new file mode 100644
index 00000000..08b52510
--- /dev/null
+++ b/src/tuning/transpose_pad.cc
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TunePadTranspose {
+ public:
+
+  // The representative kernel and the source code
+  static std::string KernelFamily() { return "padtranspose"; }
+  static std::string KernelName() { return "TransposePadMatrix"; }
+  static std::string GetSources() {
+    return
+      #include "../src/kernels/common.opencl"
+      #include "../src/kernels/level3/level3.opencl"
+      #include "../src/kernels/level3/transpose_pad.opencl"
+    ;
+  }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+
+  // Tests for valid arguments
+  static void TestValidArguments(const Arguments<T> &) { }
+
+  // Sets the default values for the arguments
+  static size_t DefaultM() { return 1024; }
+  static size_t DefaultN() { return 1024; }
+  static size_t DefaultK() { return 1; } // N/A for this kernel
+  static double DefaultFraction() { return 1.0; } // N/A for this kernel
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
+
+  // Sets the tuning parameters and their possible values
+  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
+    tuner.AddParameter(id, "PADTRA_TILE", {8, 16, 32, 64});
+    tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16});
+    tuner.AddParameter(id, "PADTRA_PAD", {0, 1});
+  }
+
+  // Sets the constraints and local memory size
+  static void SetConstraints(cltune::Tuner &, const size_t) { }
+  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
+    auto LocalMemorySize = [args] (std::vector<size_t> v) {
+      return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision));
+    };
+    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"});
+  }
+
+  // Sets the base thread configuration
+  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
+  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
+  static std::vector<size_t> LocalSize() { return {1, 1}; }
+  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
+
+  // Transforms the thread configuration based on the parameters
+  using TransformVector = std::vector<std::vector<std::string>>;
+  static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; }
+  static TransformVector DivLocal() { return {}; }
+  static TransformVector MulGlobal() { return {}; }
+  static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; }
+
+  // Sets the kernel's arguments
+  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
+                           std::vector<T> &, std::vector<T> &,
+                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
+                           std::vector<T> &) {
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(static_cast<int>(args.n));
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(0);
+    tuner.AddArgumentInput(a_mat);
+    tuner.AddArgumentScalar(static_cast<int>(args.n));
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(static_cast<int>(args.n));
+    tuner.AddArgumentScalar(0);
+    tuner.AddArgumentOutput(b_mat);
+    tuner.AddArgumentScalar(0);
+  }
+
+  // Describes how to compute the performance metrics
+  static size_t GetMetric(const Arguments<T> &args) {
+    return 2 * args.m * args.n * GetBytes(args.precision);
+  }
+  static std::string PerformanceUnit() { return "GB/s"; }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePadTranspose<half>, half>(argc, argv); break;
+    case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePadTranspose<float>, float>(argc, argv); break;
+    case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePadTranspose<double>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePadTranspose<float2>, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TunePadTranspose<double2>, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
-- 
cgit v1.2.3