23 files changed, 220 insertions, 215 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d018e211..7973655b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,7 @@ Development version (next release)
 - Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems
 - Fixed a bug related to the cache and retrieval of programs based on the OpenCL context
 - Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels
+- Fixed a bug in the OpenCL kernels: now placing __kernel before __attribute__
 - Added an option (-warm_up) to do a warm-up run before timing in the performance clients
 - Improved performance significantly of rotated GEMV computations
 - Added tuned parameters for various devices (see README)
diff --git a/README.md b/README.md
index 1b3adcb9..04a8e470 100644
--- a/README.md
+++ b/README.md
@@ -286,6 +286,7 @@ The contributing authors (code, pull requests, testing) so far are:
 * [Marco Hutter](https://github.com/gpus)
 * [Hugh Perkins](https://github.com/hughperkins)
 * [Gian-Carlo Pascutto](https://github.com/gcp)
+* [Dimitri VA](https://github.com/dvasschemacq)
 
 Tuning and testing on a variety of OpenCL devices was made possible by:
 
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 48d0eb5c..48ad2e75 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -30,10 +30,10 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xamax(const int n,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global singlereal* maxgm, __global unsigned int* imaxgm) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xamax(const int n,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           __global singlereal* maxgm, __global unsigned int* imaxgm) {
   __local singlereal maxlm[WGS1];
   __local unsigned int imaxlm[WGS1];
   const int lid = get_local_id(0);
@@ -95,10 +95,10 @@ __kernel void Xamax(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
-                            const __global unsigned int* restrict imaxgm,
-                            __global unsigned int* imax, const int imax_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XamaxEpilogue(const __global singlereal* restrict maxgm,
+                   const __global unsigned int* restrict imaxgm,
+                   __global unsigned int* imax, const int imax_offset) {
   __local singlereal maxlm[WGS2];
   __local unsigned int imaxlm[WGS2];
   const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 58d0f11b..1fc91be8 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -30,10 +30,10 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xasum(const int n,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global real* output) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xasum(const int n,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           __global real* output) {
   __local real lm[WGS1];
   const int lid = get_local_id(0);
   const int wgid = get_group_id(0);
@@ -74,9 +74,9 @@ __kernel void Xasum(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XasumEpilogue(const __global real* restrict input,
-                            __global real* asum, const int asum_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XasumEpilogue(const __global real* restrict input,
+                   __global real* asum, const int asum_offset) {
   __local real lm[WGS2];
   const int lid = get_local_id(0);
 
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index d533041b..ece8476e 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -22,10 +22,10 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const real_arg arg_alpha,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global real* ygm, const int y_offset, const int y_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xaxpy(const int n, const real_arg arg_alpha,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           __global real* ygm, const int y_offset, const int y_inc) {
   const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
@@ -40,10 +40,10 @@ __kernel void Xaxpy(const int n, const real_arg arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const real_arg arg_alpha,
-                        const __global realV* restrict xgm,
-                        __global realV* ygm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFast(const int n, const real_arg arg_alpha,
+               const __global realV* restrict xgm,
+               __global realV* ygm) {
   const real alpha = GetRealArg(arg_alpha);
 
   #pragma unroll
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index 97c27ccf..228e0735 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -22,10 +22,10 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xcopy(const int n,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global real* ygm, const int y_offset, const int y_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xcopy(const int n,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           __global real* ygm, const int y_offset, const int y_inc) {
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
   #pragma unroll
@@ -38,10 +38,10 @@ __kernel void Xcopy(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XcopyFast(const int n,
-                        const __global realV* restrict xgm,
-                        __global realV* ygm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XcopyFast(const int n,
+               const __global realV* restrict xgm,
+               __global realV* ygm) {
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
     const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl
index e13eb3c1..02f04ea7 100644
--- a/src/kernels/level1/xdot.opencl
+++ b/src/kernels/level1/xdot.opencl
@@ -30,11 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the sum operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xdot(const int n,
-                   const __global real* restrict xgm, const int x_offset, const int x_inc,
-                   const __global real* restrict ygm, const int y_offset, const int y_inc,
-                   __global real* output, const int do_conjugate) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xdot(const int n,
+          const __global real* restrict xgm, const int x_offset, const int x_inc,
+          const __global real* restrict ygm, const int y_offset, const int y_inc,
+          __global real* output, const int do_conjugate) {
   __local real lm[WGS1];
   const int lid = get_local_id(0);
   const int wgid = get_group_id(0);
@@ -73,9 +73,9 @@ __kernel void Xdot(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XdotEpilogue(const __global real* restrict input,
-                           __global real* dot, const int dot_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XdotEpilogue(const __global real* restrict input,
+                  __global real* dot, const int dot_offset) {
   __local real lm[WGS2];
   const int lid = get_local_id(0);
 
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index 9803687a..f6d869cb 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -30,10 +30,10 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xnrm2(const int n,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global real* output) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xnrm2(const int n,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           __global real* output) {
   __local real lm[WGS1];
   const int lid = get_local_id(0);
   const int wgid = get_group_id(0);
@@ -72,9 +72,9 @@ __kernel void Xnrm2(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void Xnrm2Epilogue(const __global real* restrict input,
-                            __global real* nrm2, const int nrm2_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void Xnrm2Epilogue(const __global real* restrict input,
+                   __global real* nrm2, const int nrm2_offset) {
   __local real lm[WGS2];
   const int lid = get_local_id(0);
 
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index 59936776..3da9c2fd 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -22,9 +22,10 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xscal(const int n, const real alpha,
-                    __global real* xgm, const int x_offset, const int x_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xscal(const int n, const real_arg arg_alpha,
+           __global real* xgm, const int x_offset, const int x_inc) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
   #pragma unroll
@@ -40,9 +41,11 @@ __kernel void Xscal(const int n, const real alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XscalFast(const int n, const real alpha,
-                        __global realV* xgm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XscalFast(const int n, const real_arg arg_alpha,
+               __global realV* xgm) {
+  const real alpha = GetRealArg(arg_alpha);
+
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
     const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index f6487b58..267271c0 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -22,10 +22,10 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xswap(const int n,
-                    __global real* xgm, const int x_offset, const int x_inc,
-                    __global real* ygm, const int y_offset, const int y_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xswap(const int n,
+           __global real* xgm, const int x_offset, const int x_inc,
+           __global real* ygm, const int y_offset, const int y_inc) {
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
   #pragma unroll
@@ -40,10 +40,10 @@ __kernel void Xswap(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XswapFast(const int n,
-                        __global realV* xgm,
-                        __global realV* ygm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XswapFast(const int n,
+               __global realV* xgm,
+               __global realV* ygm) {
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
     const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 83b6b15d..ff011acd 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -210,8 +210,8 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
 // =================================================================================================
 
 // Full version of the kernel
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xgemv(const int m, const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xgemv(const int m, const int n,
                     const real_arg arg_alpha,
                     const real_arg arg_beta,
                     const int a_rotated,
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 210c42c1..02a1f956 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -88,16 +88,16 @@ inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, co
 // --> 'a_ld' is a multiple of VW2
 // --> 'a_rotated' is 0
 // --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XgemvFast(const int m, const int n,
-                        const real_arg arg_alpha,
-                        const real_arg arg_beta,
-                        const int a_rotated,
-                        const __global realVF* restrict agm, const int a_offset, const int a_ld,
-                        const __global real* restrict xgm, const int x_offset, const int x_inc,
-                        __global real* ygm, const int y_offset, const int y_inc,
-                        const int do_conjugate, const int parameter,
-                        const int kl_unused, const int ku_unused) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XgemvFast(const int m, const int n,
+               const real_arg arg_alpha,
+               const real_arg arg_beta,
+               const int a_rotated,
+               const __global realVF* restrict agm, const int a_offset, const int a_ld,
+               const __global real* restrict xgm, const int x_offset, const int x_inc,
+               __global real* ygm, const int y_offset, const int y_inc,
+               const int do_conjugate, const int parameter,
+               const int kl_unused, const int ku_unused) {
   const real alpha = GetRealArg(arg_alpha);
   const real beta = GetRealArg(arg_beta);
 
@@ -190,16 +190,16 @@ __kernel void XgemvFast(const int m, const int n,
 // --> 'a_ld' is a multiple of VW3
 // --> 'a_rotated' is 1
 // --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS3, 1, 1)))
-__kernel void XgemvFastRot(const int m, const int n,
-                           const real_arg arg_alpha,
-                           const real_arg arg_beta,
-                           const int a_rotated,
-                           const __global realVFR* restrict agm, const int a_offset, const int a_ld,
-                           const __global real* restrict xgm, const int x_offset, const int x_inc,
-                           __global real* ygm, const int y_offset, const int y_inc,
-                           const int do_conjugate, const int parameter,
-                           const int kl_unused, const int ku_unused) {
+__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
+void XgemvFastRot(const int m, const int n,
+                  const real_arg arg_alpha,
+                  const real_arg arg_beta,
+                  const int a_rotated,
+                  const __global realVFR* restrict agm, const int a_offset, const int a_ld,
+                  const __global real* restrict xgm, const int x_offset, const int x_inc,
+                  __global real* ygm, const int y_offset, const int y_inc,
+                  const int do_conjugate, const int parameter,
+                  const int kl_unused, const int ku_unused) {
   const real alpha = GetRealArg(arg_alpha);
   const real beta = GetRealArg(arg_beta);
 
diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl
index f218a346..1b9ded12 100644
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@@ -18,13 +18,13 @@ R"(
 // =================================================================================================
 
 // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xger(const int max1, const int max2,
-                   const real_arg arg_alpha,
-                   const __global real* restrict xgm, const int x_offset, const int x_inc,
-                   const __global real* ygm, const int y_offset, const int y_inc,
-                   __global real* restrict agm, const int a_offset, const int a_ld,
-                   const int is_rowmajor) {
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xger(const int max1, const int max2,
+          const real_arg arg_alpha,
+          const __global real* restrict xgm, const int x_offset, const int x_inc,
+          const __global real* ygm, const int y_offset, const int y_inc,
+          __global real* restrict agm, const int a_offset, const int a_ld,
+          const int is_rowmajor) {
   const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and Y
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index 1200ee63..b0772218 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -18,12 +18,12 @@ R"(
 // =================================================================================================
 
 // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher(const int n,
-                   const real_arg arg_alpha,
-                   const __global real* restrict xgm, const int x_offset, const int x_inc,
-                   __global real* restrict agm, const int a_offset, const int a_ld,
-                   const int is_upper, const int is_rowmajor) {
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher(const int n,
+          const real_arg arg_alpha,
+          const __global real* restrict xgm, const int x_offset, const int x_inc,
+          __global real* restrict agm, const int a_offset, const int a_ld,
+          const int is_upper, const int is_rowmajor) {
   const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and XT
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index d0f41571..00a756c9 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -18,13 +18,13 @@ R"(
 // =================================================================================================
 
 // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher2(const int n,
-                    const real_arg arg_alpha,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    const __global real* restrict ygm, const int y_offset, const int y_inc,
-                    __global real* restrict agm, const int a_offset, const int a_ld,
-                    const int is_upper, const int is_rowmajor) {
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher2(const int n,
+           const real_arg arg_alpha,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           const __global real* restrict ygm, const int y_offset, const int y_inc,
+           __global real* restrict agm, const int a_offset, const int a_ld,
+           const int is_upper, const int is_rowmajor) {
   const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and Y
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index 53cc161a..ed2ded98 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -20,13 +20,13 @@ R"(
 
 // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermLowerToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -59,13 +59,13 @@ __kernel void HermLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermUpperToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index c6ce93ca..8ae53b37 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -20,13 +20,13 @@ R"(
 
 // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmLowerToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -53,13 +53,13 @@ __kernel void SymmLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmUpperToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index fdd2461a..f848dcc1 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -20,14 +20,14 @@ R"(
 
 // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int unit_diagonal) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaLowerToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest,
+                        const int unit_diagonal) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -55,14 +55,14 @@ __kernel void TriaLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int unit_diagonal) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaUpperToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest,
+                        const int unit_diagonal) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index dd975bf1..695b9003 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -35,11 +35,11 @@ R"(
 
 // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
 // COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
-__kernel void CopyMatrixFast(const int ld,
-                             __global const realC* restrict src,
-                             __global realC* dest,
-                             const real_arg arg_alpha) {
+__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+void CopyMatrixFast(const int ld,
+                    __global const realC* restrict src,
+                    __global realC* dest,
+                    const real_arg arg_alpha) {
   const real alpha = GetRealArg(arg_alpha);
   #pragma unroll
   for (int w_one=0; w_one<COPY_WPT; ++w_one) {
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index d0771c31..29480b25 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -24,15 +24,15 @@ R"(
 // Copies a matrix from source to destination. The output is padded with zero values in case the
 // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
 // value and offset can be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyPadMatrix(const int src_one, const int src_two,
-                            const int src_ld, const int src_offset,
-                            __global const real* restrict src,
-                            const int dest_one, const int dest_two,
-                            const int dest_ld, const int dest_offset,
-                            __global real* dest,
-                            const real_arg arg_alpha,
-                            const int do_conjugate) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyPadMatrix(const int src_one, const int src_two,
+                   const int src_ld, const int src_offset,
+                   __global const real* restrict src,
+                   const int dest_one, const int dest_two,
+                   const int dest_ld, const int dest_offset,
+                   __global real* dest,
+                   const real_arg arg_alpha,
+                   const int do_conjugate) {
   const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work per thread in both dimensions
@@ -65,16 +65,16 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
 // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
 // writes only the actual data back to the destination matrix. Again, the ld value and offset can
 // be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyMatrix(const int src_one, const int src_two,
-                         const int src_ld, const int src_offset,
-                         __global const real* restrict src,
-                         const int dest_one, const int dest_two,
-                         const int dest_ld, const int dest_offset,
-                         __global real* dest,
-                         const real_arg arg_alpha,
-                         const int upper, const int lower,
-                         const int diagonal_imag_zero) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyMatrix(const int src_one, const int src_two,
+                const int src_ld, const int src_offset,
+                __global const real* restrict src,
+                const int dest_one, const int dest_two,
+                const int dest_ld, const int dest_offset,
+                __global real* dest,
+                const real_arg arg_alpha,
+                const int upper, const int lower,
+                const int diagonal_imag_zero) {
   const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work per thread in both dimensions
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index ea343533..70156d3a 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -36,11 +36,11 @@ R"(
 
 // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
 // offset. A more general version is available in 'padtranspose.opencl'.
-__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
-__kernel void TransposeMatrixFast(const int ld,
-                                  __global const realT* restrict src,
-                                  __global realT* dest,
-                                  const real_arg arg_alpha) {
+__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+void TransposeMatrixFast(const int ld,
+                         __global const realT* restrict src,
+                         __global realT* dest,
+                         const real_arg arg_alpha) {
   const real alpha = GetRealArg(arg_alpha);
 
   // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index 2e20d667..ba0b7062 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -24,15 +24,15 @@ R"(
 
 // Transposes a matrix from source to destination. The output is padded with zero values in case the
 // destination matrix dimensions are larger than the transposed source matrix dimensions.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposePadMatrix(const int src_one, const int src_two,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_one, const int dest_two,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const real_arg arg_alpha,
-                                 const int do_conjugate) {
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposePadMatrix(const int src_one, const int src_two,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_one, const int dest_two,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest,
+                        const real_arg arg_alpha,
+                        const int do_conjugate) {
   const real alpha = GetRealArg(arg_alpha);
 
   // Local memory to store a tile of the matrix (for coalescing)
@@ -88,16 +88,16 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
 // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
 // padded source matrix, but only the actual data is written back to the transposed destination
 // matrix. This kernel optionally checks for upper/lower triangular matrices.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposeMatrix(const int src_one, const int src_two,
-                              const int src_ld, const int src_offset,
-                              __global const real* restrict src,
-                              const int dest_one, const int dest_two,
-                              const int dest_ld, const int dest_offset,
-                              __global real* dest,
-                              const real_arg arg_alpha,
-                              const int upper, const int lower,
-                              const int diagonal_imag_zero) {
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposeMatrix(const int src_one, const int src_two,
+                     const int src_ld, const int src_offset,
+                     __global const real* restrict src,
+                     const int dest_one, const int dest_two,
+                     const int dest_ld, const int dest_offset,
+                     __global real* dest,
+                     const real_arg arg_alpha,
+                     const int upper, const int lower,
+                     const int diagonal_imag_zero) {
   const real alpha = GetRealArg(arg_alpha);
 
   // Local memory to store a tile of the matrix (for coalescing)
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 87e28cb5..a1559b54 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -268,13 +268,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
 #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
 
 // Main entry point of the kernel. This is the upper-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
-                         const real_arg arg_alpha,
-                         const real_arg arg_beta,
-                         const __global realM* restrict agm,
-                         const __global realN* restrict bgm,
-                         __global realM* cgm) {
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmUpper(const int kSizeN, const int kSizeK,
+                const real_arg arg_alpha,
+                const real_arg arg_beta,
+                const __global realM* restrict agm,
+                const __global realN* restrict bgm,
+                __global realM* cgm) {
   const real alpha = GetRealArg(arg_alpha);
   const real beta = GetRealArg(arg_beta);
 
@@ -308,13 +308,13 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
 }
 
 // Main entry point of the kernel. This is the lower-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmLower(const int kSizeN, const int kSizeK,
-                         const real_arg arg_alpha,
-                         const real_arg arg_beta,
-                         const __global realM* restrict agm,
-                         const __global realN* restrict bgm,
-                         __global realM* cgm) {
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmLower(const int kSizeN, const int kSizeK,
+                const real_arg arg_alpha,
+                const real_arg arg_beta,
+                const __global realM* restrict agm,
+                const __global realN* restrict bgm,
+                __global realM* cgm) {
   const real alpha = GetRealArg(arg_alpha);
   const real beta = GetRealArg(arg_beta);
 
@@ -352,13 +352,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
 #else
 
 // Main entry point of the kernel. This is the regular full version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
-                    const real_arg arg_alpha,
-                    const real_arg arg_beta,
-                    const __global realM* restrict agm,
-                    const __global realN* restrict bgm,
-                    __global realM* cgm) {
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+           const real_arg arg_alpha,
+           const real_arg arg_beta,
+           const __global realM* restrict agm,
+           const __global realN* restrict bgm,
+           __global realM* cgm) {
   const real alpha = GetRealArg(arg_alpha);
   const real beta = GetRealArg(arg_beta);