Diffstat (limited to 'src/kernels')
-rw-r--r--  src/kernels/common.opencl                       17
-rw-r--r--  src/kernels/level1/xamax.opencl                 16
-rw-r--r--  src/kernels/level1/xasum.opencl                 14
-rw-r--r--  src/kernels/level1/xaxpy.opencl                 20
-rw-r--r--  src/kernels/level1/xcopy.opencl                 16
-rw-r--r--  src/kernels/level1/xdot.opencl                  16
-rw-r--r--  src/kernels/level1/xnrm2.opencl                 14
-rw-r--r--  src/kernels/level1/xscal.opencl                 15
-rw-r--r--  src/kernels/level1/xswap.opencl                 16
-rw-r--r--  src/kernels/level2/xgemv.opencl                 12
-rw-r--r--  src/kernels/level2/xgemv_fast.opencl           187
-rw-r--r--  src/kernels/level2/xger.opencl                  16
-rw-r--r--  src/kernels/level2/xher.opencl                  14
-rw-r--r--  src/kernels/level2/xher2.opencl                 16
-rw-r--r--  src/kernels/level3/convert_hermitian.opencl     28
-rw-r--r--  src/kernels/level3/convert_symmetric.opencl     28
-rw-r--r--  src/kernels/level3/convert_triangular.opencl    32
-rw-r--r--  src/kernels/level3/copy_fast.opencl             12
-rw-r--r--  src/kernels/level3/copy_pad.opencl              42
-rw-r--r--  src/kernels/level3/transpose_fast.opencl        12
-rw-r--r--  src/kernels/level3/transpose_pad.opencl         42
-rw-r--r--  src/kernels/level3/xgemm_part1.opencl            2
-rw-r--r--  src/kernels/level3/xgemm_part2.opencl          324
-rw-r--r--  src/kernels/level3/xgemm_part3.opencl          229
24 files changed, 615 insertions, 525 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 08c47d87..223501fd 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -109,6 +109,16 @@ R"(
typedef real singlereal;
#endif
+// Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is
+// no conversion, but half-precision is not supported as a kernel argument, so it is converted from float.
+#if PRECISION == 16
+ typedef float real_arg;
+ #define GetRealArg(x) (half)x
+#else
+ typedef real real_arg;
+ #define GetRealArg(x) x
+#endif
+
// =================================================================================================
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific
@@ -138,6 +148,13 @@ R"(
#define SetToOne(a) a = ONE
#endif
+// Determines whether a variable is zero
+#if PRECISION == 3232 || PRECISION == 6464
+ #define IsZero(a) ((a.x == ZERO) && (a.y == ZERO))
+#else
+ #define IsZero(a) (a == ZERO)
+#endif
+
// The absolute value (component-wise)
#if PRECISION == 3232 || PRECISION == 6464
#define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y)
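For clarity, here is a minimal usage sketch of the new real_arg/GetRealArg and IsZero helpers. It is illustrative only: the kernel name is hypothetical, and it assumes common.opencl (with its real, MultiplyAdd and PRECISION definitions) is prepended as usual.

// Hypothetical example kernel, not part of this patch:
__kernel void ExampleAxpy(const int n, const real_arg arg_alpha,
                          const __global real* restrict xgm, __global real* ygm) {
  const real alpha = GetRealArg(arg_alpha);  // float -> half when PRECISION == 16, pass-through otherwise
  if (IsZero(alpha)) { return; }             // nothing to add; component-wise test for complex precisions
  const int id = get_global_id(0);
  if (id < n) {
    MultiplyAdd(ygm[id], alpha, xgm[id]);    // y[id] += alpha * x[id]
  }
}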
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 48d0eb5c..48ad2e75 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -30,10 +30,10 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xamax(const int n,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global singlereal* maxgm, __global unsigned int* imaxgm) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xamax(const int n,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global singlereal* maxgm, __global unsigned int* imaxgm) {
__local singlereal maxlm[WGS1];
__local unsigned int imaxlm[WGS1];
const int lid = get_local_id(0);
@@ -95,10 +95,10 @@ __kernel void Xamax(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
- const __global unsigned int* restrict imaxgm,
- __global unsigned int* imax, const int imax_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XamaxEpilogue(const __global singlereal* restrict maxgm,
+ const __global unsigned int* restrict imaxgm,
+ __global unsigned int* imax, const int imax_offset) {
__local singlereal maxlm[WGS2];
__local unsigned int imaxlm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 58d0f11b..1fc91be8 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -30,10 +30,10 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xasum(const int n,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* output) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xasum(const int n,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
@@ -74,9 +74,9 @@ __kernel void Xasum(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XasumEpilogue(const __global real* restrict input,
- __global real* asum, const int asum_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XasumEpilogue(const __global real* restrict input,
+ __global real* asum, const int asum_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index e0efadc1..ece8476e 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -22,11 +22,11 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xaxpy(const int n, const real_arg arg_alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
+ const real alpha = GetRealArg(arg_alpha);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@@ -40,11 +40,11 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
- const __global realV* restrict xgm,
- __global realV* ygm) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFast(const int n, const real_arg arg_alpha,
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
+ const real alpha = GetRealArg(arg_alpha);
#pragma unroll
for (int w=0; w<WPT; ++w) {
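Host-side context (a hedged sketch, not code from this patch): because alpha is now a by-value real_arg rather than a __constant buffer, a plain OpenCL host binds it directly with clSetKernelArg. The kernel handle name and the argument index (1, matching Xaxpy above) are assumptions for illustration.

/* Before: alpha lived in a small device buffer                               */
/*   clSetKernelArg(xaxpy_kernel, 1, sizeof(cl_mem), &alpha_buffer);          */
/* After: alpha is passed by value; for PRECISION == 16 the host still hands  */
/* over a float, which GetRealArg() turns into a half inside the kernel.      */
cl_float alpha = 2.0f;
cl_int err = clSetKernelArg(xaxpy_kernel, 1, sizeof(cl_float), &alpha);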
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index 97c27ccf..228e0735 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -22,10 +22,10 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xcopy(const int n,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xcopy(const int n,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@@ -38,10 +38,10 @@ __kernel void Xcopy(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XcopyFast(const int n,
- const __global realV* restrict xgm,
- __global realV* ygm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XcopyFast(const int n,
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl
index e13eb3c1..02f04ea7 100644
--- a/src/kernels/level1/xdot.opencl
+++ b/src/kernels/level1/xdot.opencl
@@ -30,11 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the sum operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xdot(const int n,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- const __global real* restrict ygm, const int y_offset, const int y_inc,
- __global real* output, const int do_conjugate) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xdot(const int n,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ const __global real* restrict ygm, const int y_offset, const int y_inc,
+ __global real* output, const int do_conjugate) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
@@ -73,9 +73,9 @@ __kernel void Xdot(const int n,
// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XdotEpilogue(const __global real* restrict input,
- __global real* dot, const int dot_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XdotEpilogue(const __global real* restrict input,
+ __global real* dot, const int dot_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index 9803687a..f6d869cb 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -30,10 +30,10 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xnrm2(const int n,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* output) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xnrm2(const int n,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
@@ -72,9 +72,9 @@ __kernel void Xnrm2(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void Xnrm2Epilogue(const __global real* restrict input,
- __global real* nrm2, const int nrm2_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void Xnrm2Epilogue(const __global real* restrict input,
+ __global real* nrm2, const int nrm2_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index 59936776..3da9c2fd 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -22,9 +22,10 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xscal(const int n, const real alpha,
- __global real* xgm, const int x_offset, const int x_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xscal(const int n, const real_arg arg_alpha,
+ __global real* xgm, const int x_offset, const int x_inc) {
+ const real alpha = GetRealArg(arg_alpha);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@@ -40,9 +41,11 @@ __kernel void Xscal(const int n, const real alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XscalFast(const int n, const real alpha,
- __global realV* xgm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XscalFast(const int n, const real_arg arg_alpha,
+ __global realV* xgm) {
+ const real alpha = GetRealArg(arg_alpha);
+
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index f6487b58..267271c0 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -22,10 +22,10 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xswap(const int n,
- __global real* xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xswap(const int n,
+ __global real* xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@@ -40,10 +40,10 @@ __kernel void Xswap(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XswapFast(const int n,
- __global realV* xgm,
- __global realV* ygm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XswapFast(const int n,
+ __global realV* xgm,
+ __global realV* ygm) {
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 65b4291f..ff011acd 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -210,18 +210,18 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
// =================================================================================================
// Full version of the kernel
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xgemv(const int m, const int n,
- const __constant real* restrict arg_alpha,
- const __constant real* restrict arg_beta,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xgemv(const int m, const int n,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
const int a_rotated,
const __global real* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
- const real alpha = arg_alpha[0];
- const real beta = arg_beta[0];
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
// Local memory for the vector X
__local real xlm[WGS1];
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 6a494e84..02a1f956 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -38,7 +38,7 @@ R"(
#define WGS3 64 // The local work-group size
#endif
#ifndef WPT3
- #define WPT3 1 // The amount of work-per-thread
+ #define WPT3 1 // The tile-size
#endif
#ifndef VW3
#define VW3 1 // Vector width of matrix A loads
@@ -74,18 +74,12 @@ R"(
// =================================================================================================
-// Loads a vector input value (1/2)
+// Loads a vector input value
inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
const int a_ld) {
return agm[a_ld*y + x];
}
-// Loads a vector input value (2/2): as before, but different data-type
-inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
- const int a_ld) {
- return agm[a_ld*y + x];
-}
-
// =================================================================================================
// Faster version of the kernel, assuming that:
@@ -94,23 +88,23 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XgemvFast(const int m, const int n,
- const __constant real* restrict arg_alpha,
- const __constant real* restrict arg_beta,
- const int a_rotated,
- const __global realVF* restrict agm, const int a_offset, const int a_ld,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc,
- const int do_conjugate, const int parameter,
- const int kl, const int ku) {
- const real alpha = arg_alpha[0];
- const real beta = arg_beta[0];
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XgemvFast(const int m, const int n,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const int a_rotated,
+ const __global realVF* restrict agm, const int a_offset, const int a_ld,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc,
+ const int do_conjugate, const int parameter,
+ const int kl_unused, const int ku_unused) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
// Local memory for the vector X
__local real xlm[WGS2];
- // Initializes the accumulation register
+ // Initializes the accumulation registers
real acc[WPT2];
#pragma unroll
for (int w=0; w<WPT2; ++w) {
@@ -134,7 +128,7 @@ __kernel void XgemvFast(const int m, const int n,
#pragma unroll
for (int w=0; w<WPT2/VW2; ++w) {
const int gid = (WPT2/VW2)*get_global_id(0) + w;
- realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
+ realVF avec = agm[(a_ld/VW2)*k + gid];
#if VW2 == 1
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec);
#elif VW2 == 2
@@ -196,84 +190,96 @@ __kernel void XgemvFast(const int m, const int n,
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS3, 1, 1)))
-__kernel void XgemvFastRot(const int m, const int n,
- const __constant real* restrict arg_alpha,
- const __constant real* restrict arg_beta,
- const int a_rotated,
- const __global realVFR* restrict agm, const int a_offset, const int a_ld,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc,
- const int do_conjugate, const int parameter,
- const int kl, const int ku) {
- const real alpha = arg_alpha[0];
- const real beta = arg_beta[0];
+__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
+void XgemvFastRot(const int m, const int n,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const int a_rotated,
+ const __global realVFR* restrict agm, const int a_offset, const int a_ld,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc,
+ const int do_conjugate, const int parameter,
+ const int kl_unused, const int ku_unused) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Local memory to store a tile of the matrix (for coalescing)
+ __local real tile[WPT3][WGS3];
+ const int lid = get_local_id(0);
+ const int lid_mod = lid % (WPT3/VW3);
+ const int lid_div = lid / (WPT3/VW3);
// Local memory for the vector X
- __local real xlm[WGS3];
+ __local real xlm[WPT3];
// Initializes the accumulation register
- real acc[WPT3];
- #pragma unroll
- for (int w=0; w<WPT3; ++w) {
- SetToZero(acc[w]);
- }
+ real acc;
+ SetToZero(acc);
- // Loops over work-group sized portions of the work
- for (int kwg=0; kwg<n; kwg+=WGS3) {
+ // Loops over tile-sized portions of the work
+ for (int kwg=0; kwg<n; kwg+=WPT3) {
// Loads the vector X into local memory
- const int lid = get_local_id(0);
- xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
+ if (lid < WPT3) {
+ xlm[lid] = xgm[(kwg + lid) * x_inc + x_offset];
+ }
+
+ // Loads the matrix A into local memory
+ #pragma unroll
+ for (int kl=0; kl<WPT3/VW3; ++kl) {
+ const int x = (kwg/VW3) + lid_mod;
+ const int y = get_group_id(0) * WGS3 + lid_div * (WPT3/VW3) + kl;
+ realVFR avec = agm[(a_ld/VW3) * y + x];
+ #if VW3 == 1
+ tile[kl*VW3 + 0][lid] = avec;
+ #elif VW3 == 2
+ tile[kl*VW3 + 0][lid] = avec.x;
+ tile[kl*VW3 + 1][lid] = avec.y;
+ #elif VW3 == 4
+ tile[kl*VW3 + 0][lid] = avec.x;
+ tile[kl*VW3 + 1][lid] = avec.y;
+ tile[kl*VW3 + 2][lid] = avec.z;
+ tile[kl*VW3 + 3][lid] = avec.w;
+ #elif VW3 == 8
+ tile[kl*VW3 + 0][lid] = avec.s0;
+ tile[kl*VW3 + 1][lid] = avec.s1;
+ tile[kl*VW3 + 2][lid] = avec.s2;
+ tile[kl*VW3 + 3][lid] = avec.s3;
+ tile[kl*VW3 + 4][lid] = avec.s4;
+ tile[kl*VW3 + 5][lid] = avec.s5;
+ tile[kl*VW3 + 6][lid] = avec.s6;
+ tile[kl*VW3 + 7][lid] = avec.s7;
+ #elif VW3 == 16
+ tile[kl*VW3 + 0][lid] = avec.s0;
+ tile[kl*VW3 + 1][lid] = avec.s1;
+ tile[kl*VW3 + 2][lid] = avec.s2;
+ tile[kl*VW3 + 3][lid] = avec.s3;
+ tile[kl*VW3 + 4][lid] = avec.s4;
+ tile[kl*VW3 + 5][lid] = avec.s5;
+ tile[kl*VW3 + 6][lid] = avec.s6;
+ tile[kl*VW3 + 7][lid] = avec.s7;
+ tile[kl*VW3 + 8][lid] = avec.s8;
+ tile[kl*VW3 + 9][lid] = avec.s9;
+ tile[kl*VW3 + 10][lid] = avec.sA;
+ tile[kl*VW3 + 11][lid] = avec.sB;
+ tile[kl*VW3 + 12][lid] = avec.sC;
+ tile[kl*VW3 + 13][lid] = avec.sD;
+ tile[kl*VW3 + 14][lid] = avec.sE;
+ tile[kl*VW3 + 15][lid] = avec.sF;
+ #endif
+ }
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// The multiply-add function (rotated)
#pragma unroll
- for (int kl=0; kl<WGS3/VW3; ++kl) {
- const int k = (kwg/VW3) + kl;
+ for (int kl=0; kl<WPT3/VW3; ++kl) {
#pragma unroll
- for (int w=0; w<WPT3; ++w) {
- const int gid = WPT3*get_global_id(0) + w;
- realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
- #if VW3 == 1
- MultiplyAdd(acc[w], xlm[VW3*kl+0], avec);
- #elif VW3 == 2
- MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
- MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
- #elif VW3 == 4
- MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
- MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
- MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z);
- MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w);
- #elif VW3 == 8
- MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
- MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
- MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
- MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
- MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
- MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
- MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
- MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
- #elif VW3 == 16
- MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
- MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
- MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
- MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
- MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
- MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
- MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
- MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
- MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8);
- MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9);
- MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA);
- MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB);
- MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC);
- MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD);
- MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE);
- MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF);
- #endif
+ for (int v=0; v<VW3; ++v) {
+ real aval = tile[lid_mod*VW3 + v][lid_div * (WPT3/VW3) + kl];
+ real xval = xlm[kl*VW3 + v];
+ MultiplyAdd(acc, xval, aval);
}
}
@@ -282,12 +288,9 @@ __kernel void XgemvFastRot(const int m, const int n,
}
// Stores the final result
- #pragma unroll
- for (int w=0; w<WPT3; ++w) {
- const int gid = WPT3*get_global_id(0) + w;
- real yval = ygm[gid*y_inc + y_offset];
- AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
- }
+ const int gid = get_global_id(0);
+ real yval = ygm[gid * y_inc + y_offset];
+ AXPBY(ygm[gid * y_inc + y_offset], alpha, acc, beta, yval);
}
// =================================================================================================
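To make the new tiling in XgemvFastRot concrete, here is an index walk-through under assumed example parameters; the particular values of WGS3, WPT3 and VW3 below are illustrative, not taken from this patch.

// Assume WGS3 = 64, WPT3 = 32, VW3 = 4, so WPT3/VW3 = 8. Take the thread with lid = 19:
//   lid_mod = 19 % 8 = 3,   lid_div = 19 / 8 = 2
// Load phase: for kl = 0..7 this thread fetches one realVFR vector of A at
//   row    y = group*64 + 2*8 + kl     (rows 16..23 of this work-group's block)
//   column x = kwg/4 + 3               (scalar columns kwg+12 .. kwg+15)
// and scatters its four components into tile[kl*4 + v][19]. For a fixed kl, threads 16..23
// read consecutive vectors of the same row, so the global loads are coalesced.
// Compute phase: after the barrier the same thread reads tile[12 + v][16 + kl], which holds
// row group*64 + 19 (its own output row), multiplies by xlm[kl*4 + v], and accumulates a
// single scalar 'acc'. The final AXPBY thus writes exactly one element of y per thread, at
// ygm[get_global_id(0)*y_inc + y_offset].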
diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl
index 63817afb..1b9ded12 100644
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@@ -18,14 +18,14 @@ R"(
// =================================================================================================
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xger(const int max1, const int max2,
- const __constant real* restrict arg_alpha,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- const __global real* ygm, const int y_offset, const int y_inc,
- __global real* restrict agm, const int a_offset, const int a_ld,
- const int is_rowmajor) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xger(const int max1, const int max2,
+ const real_arg arg_alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ const __global real* ygm, const int y_offset, const int y_inc,
+ __global real* restrict agm, const int a_offset, const int a_ld,
+ const int is_rowmajor) {
+ const real alpha = GetRealArg(arg_alpha);
// Register storage for X and Y
real xvalues[WPT];
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index fc635f2e..b0772218 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -18,13 +18,13 @@ R"(
// =================================================================================================
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher(const int n,
- const __constant real* restrict arg_alpha,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* restrict agm, const int a_offset, const int a_ld,
- const int is_upper, const int is_rowmajor) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher(const int n,
+ const real_arg arg_alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* restrict agm, const int a_offset, const int a_ld,
+ const int is_upper, const int is_rowmajor) {
+ const real alpha = GetRealArg(arg_alpha);
// Register storage for X and XT
real xvalues[WPT];
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index a66f255f..00a756c9 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -18,14 +18,14 @@ R"(
// =================================================================================================
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher2(const int n,
- const __constant real* restrict arg_alpha,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- const __global real* restrict ygm, const int y_offset, const int y_inc,
- __global real* restrict agm, const int a_offset, const int a_ld,
- const int is_upper, const int is_rowmajor) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher2(const int n,
+ const real_arg arg_alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ const __global real* restrict ygm, const int y_offset, const int y_inc,
+ __global real* restrict agm, const int a_offset, const int a_ld,
+ const int is_upper, const int is_rowmajor) {
+ const real alpha = GetRealArg(arg_alpha);
// Register storage for X and Y
real xvalues[WPT];
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index 53cc161a..ed2ded98 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -20,13 +20,13 @@ R"(
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermLowerToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermLowerToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -59,13 +59,13 @@ __kernel void HermLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix's data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermUpperToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermUpperToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index c6ce93ca..8ae53b37 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -20,13 +20,13 @@ R"(
// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmLowerToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmLowerToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -53,13 +53,13 @@ __kernel void SymmLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix's data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmUpperToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmUpperToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index fdd2461a..f848dcc1 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -20,14 +20,14 @@ R"(
// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaLowerToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const int unit_diagonal) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaLowerToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -55,14 +55,14 @@ __kernel void TriaLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix's data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaUpperToSquared(const int src_dim,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_dim,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const int unit_diagonal) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaUpperToSquared(const int src_dim,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_dim,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index 09e54e6d..695b9003 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -35,12 +35,12 @@ R"(
// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
-__kernel void CopyMatrixFast(const int ld,
- __global const realC* restrict src,
- __global realC* dest,
- const __constant real* restrict arg_alpha) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+void CopyMatrixFast(const int ld,
+ __global const realC* restrict src,
+ __global realC* dest,
+ const real_arg arg_alpha) {
+ const real alpha = GetRealArg(arg_alpha);
#pragma unroll
for (int w_one=0; w_one<COPY_WPT; ++w_one) {
const int id_one = get_global_id(0);
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index d276cc60..29480b25 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -24,16 +24,16 @@ R"(
// Copies a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
// value and offset can be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyPadMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const __constant real* restrict arg_alpha,
- const int do_conjugate) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyPadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int do_conjugate) {
+ const real alpha = GetRealArg(arg_alpha);
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -65,17 +65,17 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
// writes only the actual data back to the destination matrix. Again, the ld value and offset can
// be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const __constant real* restrict arg_alpha,
- const int upper, const int lower,
- const int diagonal_imag_zero) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int upper, const int lower,
+ const int diagonal_imag_zero) {
+ const real alpha = GetRealArg(arg_alpha);
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index d5c46a30..70156d3a 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -36,12 +36,12 @@ R"(
// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
// offset. A more general version is available in 'padtranspose.opencl'.
-__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
-__kernel void TransposeMatrixFast(const int ld,
- __global const realT* restrict src,
- __global realT* dest,
- const __constant real* restrict arg_alpha) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+void TransposeMatrixFast(const int ld,
+ __global const realT* restrict src,
+ __global realT* dest,
+ const real_arg arg_alpha) {
+ const real alpha = GetRealArg(arg_alpha);
// Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
// way over workgroups, breaking memory-bank dependencies.
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index 2de0c7bd..ba0b7062 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -24,16 +24,16 @@ R"(
// Transposes a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the transposed source matrix dimensions.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposePadMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const __constant real* restrict arg_alpha,
- const int do_conjugate) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposePadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int do_conjugate) {
+ const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -88,17 +88,17 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
// padded source matrix, but only the actual data is written back to the transposed destination
// matrix. This kernel optionally checks for upper/lower triangular matrices.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposeMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const __constant real* restrict arg_alpha,
- const int upper, const int lower,
- const int diagonal_imag_zero) {
- const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposeMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real_arg arg_alpha,
+ const int upper, const int lower,
+ const int diagonal_imag_zero) {
+ const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 1ad0a558..d0ce06ad 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -31,7 +31,7 @@
// o-------o o-----o
//
//
-// This kernel is seperated into two files. This is part 1 out of 2.
+// This kernel is separated into three files. This is part 1 out of 3.
//
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 42c1127c..e8234a29 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This is part 2 of 2 of the GEMM kernel. See part 1 for more information.
+// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
@@ -133,260 +133,98 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int
#endif
int idm = mg + GetGroupID0() * (MWG/VWM);
int idn = ng + GetGroupID1() * NWG;
-
- // The final multiplication with alpha and the addition with beta*C
int index = idn*(kSizeM/VWM) + idm;
+
realM result;
realM xval = cpm[ni][mi];
- realM yval = cgm[index];
- #if VWM == 1
- AXPBY(result, alpha, xval, beta, yval);
- #elif VWM == 2
- AXPBY(result.x, alpha, xval.x, beta, yval.x);
- AXPBY(result.y, alpha, xval.y, beta, yval.y);
- #elif VWM == 4
- AXPBY(result.x, alpha, xval.x, beta, yval.x);
- AXPBY(result.y, alpha, xval.y, beta, yval.y);
- AXPBY(result.z, alpha, xval.z, beta, yval.z);
- AXPBY(result.w, alpha, xval.w, beta, yval.w);
- #elif VWM == 8
- AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
- AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
- AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
- AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
- AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
- AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
- AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
- AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
- #elif VWM == 16
- AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
- AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
- AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
- AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
- AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
- AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
- AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
- AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
- AXPBY(result.s8, alpha, xval.s8, beta, yval.s8);
- AXPBY(result.s9, alpha, xval.s9, beta, yval.s9);
- AXPBY(result.sA, alpha, xval.sA, beta, yval.sA);
- AXPBY(result.sB, alpha, xval.sB, beta, yval.sB);
- AXPBY(result.sC, alpha, xval.sC, beta, yval.sC);
- AXPBY(result.sD, alpha, xval.sD, beta, yval.sD);
- AXPBY(result.sE, alpha, xval.sE, beta, yval.sE);
- AXPBY(result.sF, alpha, xval.sF, beta, yval.sF);
- #endif
- cgm[index] = result;
- }
- }
-}
-
-// =================================================================================================
-
-// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
-inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
- const __global realM* restrict agm, const __global realN* restrict bgm,
- __global realM* cgm, realM cpm[NWI][MWI/VWM]
- #if SA == 1 && SB == 1
- , __local realM* alm, __local realN* blm
- #elif SA == 1
- , __local realM* alm
- #elif SB == 1
- , __local realN* blm
- #endif
- ) {
-
- // Allocates workitem-private memory (registers)
- realM apm[MWI/VWM];
- realN bpm[NWI/VWN];
-
- // Combined thread identifier (volatile to disable caching)
- #if SA == 1 || SB == 1
- volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
- #endif
-
- // Initializes the accumulation registers
- InitAccRegisters(cpm);
-
- // Loops over all workgroup tiles
- for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
- // Loads data: off-chip --> local (matrix A)
- #if SA == 1
- GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
- #endif
- // Loads data: off-chip --> local (matrix B)
- #if SB == 1
- GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
- #endif
- #if SA == 1 || SB == 1
- barrier(CLK_LOCAL_MEM_FENCE);
- #endif
-
- // Loops over all workitem tiles, unrolled by a factor KWI
- for (int pwi=0; pwi<KWG; pwi+=KWI) {
- #pragma unroll
- for (int pit=0; pit<KWI; ++pit) {
- #if SA == 0 || SB == 0
- int idk = kwg + pwi + pit;
- #endif
- #if SA == 1 || SB == 1
- int kg = pwi+pit;
- #endif
-
- // Loads data: local --> private (matrix A)
- #if SA == 1
- LocalToPrivateA(alm, apm, kg);
- // Loads data: off-chip --> private (matrix A)
- #else
- GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+ // The final multiplication with alpha (in case beta == 0)
+ if (IsZero(beta)) {
+ #if VWM == 1
+ Multiply(result, alpha, xval);
+ #elif VWM == 2
+ Multiply(result.x, alpha, xval.x);
+ Multiply(result.y, alpha, xval.y);
+ #elif VWM == 4
+ Multiply(result.x, alpha, xval.x);
+ Multiply(result.y, alpha, xval.y);
+ Multiply(result.z, alpha, xval.z);
+ Multiply(result.w, alpha, xval.w);
+ #elif VWM == 8
+ Multiply(result.s0, alpha, xval.s0);
+ Multiply(result.s1, alpha, xval.s1);
+ Multiply(result.s2, alpha, xval.s2);
+ Multiply(result.s3, alpha, xval.s3);
+ Multiply(result.s4, alpha, xval.s4);
+ Multiply(result.s5, alpha, xval.s5);
+ Multiply(result.s6, alpha, xval.s6);
+ Multiply(result.s7, alpha, xval.s7);
+ #elif VWM == 16
+ Multiply(result.s0, alpha, xval.s0);
+ Multiply(result.s1, alpha, xval.s1);
+ Multiply(result.s2, alpha, xval.s2);
+ Multiply(result.s3, alpha, xval.s3);
+ Multiply(result.s4, alpha, xval.s4);
+ Multiply(result.s5, alpha, xval.s5);
+ Multiply(result.s6, alpha, xval.s6);
+ Multiply(result.s7, alpha, xval.s7);
+ Multiply(result.s8, alpha, xval.s8);
+ Multiply(result.s9, alpha, xval.s9);
+ Multiply(result.sA, alpha, xval.sA);
+ Multiply(result.sB, alpha, xval.sB);
+ Multiply(result.sC, alpha, xval.sC);
+ Multiply(result.sD, alpha, xval.sD);
+ Multiply(result.sE, alpha, xval.sE);
+ Multiply(result.sF, alpha, xval.sF);
#endif
+ }
- // Loads data: local --> private (matrix B)
- #if SB == 1
- LocalToPrivateB(blm, bpm, kg);
- // Loads data: off-chip --> private (matrix B)
- #else
- GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+ // The final multiplication with alpha and the addition with beta*C
+ else {
+ realM yval = cgm[index];
+ #if VWM == 1
+ AXPBY(result, alpha, xval, beta, yval);
+ #elif VWM == 2
+ AXPBY(result.x, alpha, xval.x, beta, yval.x);
+ AXPBY(result.y, alpha, xval.y, beta, yval.y);
+ #elif VWM == 4
+ AXPBY(result.x, alpha, xval.x, beta, yval.x);
+ AXPBY(result.y, alpha, xval.y, beta, yval.y);
+ AXPBY(result.z, alpha, xval.z, beta, yval.z);
+ AXPBY(result.w, alpha, xval.w, beta, yval.w);
+ #elif VWM == 8
+ AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
+ AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
+ AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
+ AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
+ AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
+ AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
+ AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
+ AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
+ #elif VWM == 16
+ AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
+ AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
+ AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
+ AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
+ AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
+ AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
+ AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
+ AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
+ AXPBY(result.s8, alpha, xval.s8, beta, yval.s8);
+ AXPBY(result.s9, alpha, xval.s9, beta, yval.s9);
+ AXPBY(result.sA, alpha, xval.sA, beta, yval.sA);
+ AXPBY(result.sB, alpha, xval.sB, beta, yval.sB);
+ AXPBY(result.sC, alpha, xval.sC, beta, yval.sC);
+ AXPBY(result.sD, alpha, xval.sD, beta, yval.sD);
+ AXPBY(result.sE, alpha, xval.sE, beta, yval.sE);
+ AXPBY(result.sF, alpha, xval.sF, beta, yval.sF);
#endif
-
- // Performs the accumulation (Cpm += Apm * Bpm)
- MultiplyAccumulate(cpm, apm, bpm);
}
+ cgm[index] = result;
}
- #if SA == 1 || SB == 1
- barrier(CLK_LOCAL_MEM_FENCE);
- #endif
- }
- #if GLOBAL_MEM_FENCE == 1
- barrier(CLK_GLOBAL_MEM_FENCE);
- #endif
-}
-
-// =================================================================================================
-// The upper-triangular and lower-triangular kernels are only used in special cases
-#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
-
-// Main entry point of the kernel. This is the upper-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
- const __constant real* restrict arg_alpha,
- const __constant real* restrict arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = arg_alpha[0];
- const real beta = arg_beta[0];
-
- // Skip these threads if they do not contain threads contributing to the upper-triangle
- if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
- return;
- }
-
- // Allocates workgroup-private memory (local memory)
- #if SA == 1
- __local realM alm[KWG * MWG/VWM];
- #endif
- #if SB == 1
- __local realN blm[KWG * NWG/VWN];
- #endif
-
- // Computes the matrix-multiplication and stores the result in register memory
- realM cpm[NWI][MWI/VWM];
- #if SA == 1 && SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
- #elif SA == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
- #elif SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
- #else
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
- #endif
-
- // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeN, alpha, beta);
-}
-
-// Main entry point of the kernel. This is the lower-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmLower(const int kSizeN, const int kSizeK,
- const __constant real* restrict arg_alpha,
- const __constant real* restrict arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = arg_alpha[0];
- const real beta = arg_beta[0];
-
- // Skip these threads if they do not contain threads contributing to the lower-triangle
- if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
- return;
}
-
- // Allocates workgroup-private memory (local memory)
- #if SA == 1
- __local realM alm[KWG * MWG/VWM];
- #endif
- #if SB == 1
- __local realN blm[KWG * NWG/VWN];
- #endif
-
- // Computes the matrix-multiplication and stores the result in register memory
- realM cpm[NWI][MWI/VWM];
- #if SA == 1 && SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
- #elif SA == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
- #elif SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
- #else
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
- #endif
-
- // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeN, alpha, beta);
-}
-
-// =================================================================================================
-// If not using a triangular version, include the regular kernel
-#else
-
-// Main entry point of the kernel. This is the regular full version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
- const __constant real* restrict arg_alpha,
- const __constant real* restrict arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = arg_alpha[0];
- const real beta = arg_beta[0];
-
- // Allocates workgroup-private memory (local memory)
- #if SA == 1
- __local realM alm[KWG * MWG/VWM];
- #endif
- #if SB == 1
- __local realN blm[KWG * NWG/VWN];
- #endif
-
- // Computes the matrix-multiplication and stores the result in register memory
- realM cpm[NWI][MWI/VWM];
- #if SA == 1 && SB == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
- #elif SA == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
- #elif SB == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
- #else
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
- #endif
-
- // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeM, alpha, beta);
}
-#endif
// =================================================================================================
// End of the C++11 raw string literal
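Collapsed to the VWM == 1 case, the reworked StoreResults epilogue is equivalent to the sketch below (Multiply and AXPBY semantics as defined in common.opencl). Skipping the load of C when beta is zero saves one global read per element and also means the caller may leave C uninitialized, as BLAS permits for beta == 0.

// Scalar sketch of the new epilogue (illustrative only):
if (IsZero(beta)) {
  Multiply(result, alpha, xval);            // result = alpha * acc; C is never read
}
else {
  realM yval = cgm[index];                  // load C only when it actually contributes
  AXPBY(result, alpha, xval, beta, yval);   // result = alpha * acc + beta * C
}
cgm[index] = result;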
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
new file mode 100644
index 00000000..a5faef5a
--- /dev/null
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -0,0 +1,229 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
+inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
+ const __global realM* restrict agm, const __global realN* restrict bgm,
+ __global realM* cgm, realM cpm[NWI][MWI/VWM]
+ #if SA == 1 && SB == 1
+ , __local realM* alm, __local realN* blm
+ #elif SA == 1
+ , __local realM* alm
+ #elif SB == 1
+ , __local realN* blm
+ #endif
+ ) {
+
+ // Allocates workitem-private memory (registers)
+ realM apm[MWI/VWM];
+ realN bpm[NWI/VWN];
+
+ // Combined thread identifier (volatile to disable caching)
+ #if SA == 1 || SB == 1
+ volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
+ #endif
+
+ // Initializes the accumulation registers
+ InitAccRegisters(cpm);
+
+ // Loops over all workgroup tiles
+ for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
+
+ // Loads data: off-chip --> local (matrix A)
+ #if SA == 1
+ GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
+ #endif
+ // Loads data: off-chip --> local (matrix B)
+ #if SB == 1
+ GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
+ #endif
+ #if SA == 1 || SB == 1
+ barrier(CLK_LOCAL_MEM_FENCE);
+ #endif
+
+ // Loops over all workitem tiles, unrolled by a factor KWI
+ for (int pwi=0; pwi<KWG; pwi+=KWI) {
+ #pragma unroll
+ for (int pit=0; pit<KWI; ++pit) {
+ #if SA == 0 || SB == 0
+ int idk = kwg + pwi + pit;
+ #endif
+ #if SA == 1 || SB == 1
+ int kg = pwi+pit;
+ #endif
+
+ // Loads data: local --> private (matrix A)
+ #if SA == 1
+ LocalToPrivateA(alm, apm, kg);
+ // Loads data: off-chip --> private (matrix A)
+ #else
+ GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+ #endif
+
+ // Loads data: local --> private (matrix B)
+ #if SB == 1
+ LocalToPrivateB(blm, bpm, kg);
+ // Loads data: off-chip --> private (matrix B)
+ #else
+ GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+ #endif
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulate(cpm, apm, bpm);
+ }
+ }
+ #if SA == 1 || SB == 1
+ barrier(CLK_LOCAL_MEM_FENCE);
+ #endif
+ }
+ #if GLOBAL_MEM_FENCE == 1
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ #endif
+}
+
+// =================================================================================================
+// The upper-triangular and lower-triangular kernels are only used in special cases
+#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
+
+// Main entry point of the kernel. This is the upper-triangular version.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmUpper(const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Skip these threads if they do not contain threads contributing to the upper-triangle
+ if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
+ return;
+ }
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Computes the matrix-multiplication and stores the result in register memory
+ realM cpm[NWI][MWI/VWM];
+ #if SA == 1 && SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ #elif SA == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ #elif SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ #else
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ #endif
+
+ // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// Main entry point of the kernel. This is the lower-triangular version.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmLower(const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Skip these threads if they do not contain threads contributing to the lower-triangle
+ if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
+ return;
+ }
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Computes the matrix-multiplication and stores the result in register memory
+ realM cpm[NWI][MWI/VWM];
+ #if SA == 1 && SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ #elif SA == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ #elif SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ #else
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ #endif
+
+ // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// =================================================================================================
+// If not using a triangular version, include the regular kernel
+#else
+
+// Main entry point of the kernel. This is the regular full version.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Computes the matrix-multiplication and stores the result in register memory
+ realM cpm[NWI][MWI/VWM];
+ #if SA == 1 && SB == 1
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ #elif SA == 1
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ #elif SB == 1
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ #else
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ #endif
+
+ // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeM, alpha, beta);
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================