author     D. Van Assche <dimitri.vanassche@macq.eu>  2016-08-18 17:33:13 +0200
committer  D. Van Assche <dimitri.vanassche@macq.eu>  2016-08-18 17:33:13 +0200
commit     57f1aa76857cf0566e05b43b9b2a98a3a6139c8b (patch)
tree       4ed277095302ea0870b764dde32936dad56bb410 /src
parent     7c13bacf129291e3e295ecb6e833788477085fa0 (diff)
Adapt OpenCL kernel files for OpenCL 1.1
In OpenCL 1.1, __kernel has to come before __attribute__, at least with the Vivante compiler.
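A minimal before/after sketch of the qualifier ordering this commit enforces; the kernel name, work-group size, and body below are illustrative placeholders, not taken from the repository:

    // Old ordering: attribute first, then __kernel. Per the commit message this is
    // rejected by the Vivante OpenCL 1.1 compiler.
    __attribute__((reqd_work_group_size(64, 1, 1)))
    __kernel void Example(__global float* data) {
      data[get_global_id(0)] *= 2.0f;
    }

    // New ordering used throughout this patch: __kernel first, then the attribute,
    // then the return type and kernel name.
    __kernel __attribute__((reqd_work_group_size(64, 1, 1)))
    void Example(__global float* data) {
      data[get_global_id(0)] *= 2.0f;
    }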
Diffstat (limited to 'src')
-rw-r--r--  src/kernels/level1/xamax.opencl              | 8
-rw-r--r--  src/kernels/level1/xasum.opencl              | 8
-rw-r--r--  src/kernels/level1/xaxpy.opencl              | 8
-rw-r--r--  src/kernels/level1/xcopy.opencl              | 8
-rw-r--r--  src/kernels/level1/xdot.opencl               | 8
-rw-r--r--  src/kernels/level1/xnrm2.opencl              | 8
-rw-r--r--  src/kernels/level1/xscal.opencl              | 8
-rw-r--r--  src/kernels/level1/xswap.opencl              | 8
-rw-r--r--  src/kernels/level2/xgemv.opencl              | 4
-rw-r--r--  src/kernels/level2/xgemv_fast.opencl         | 8
-rw-r--r--  src/kernels/level2/xger.opencl               | 4
-rw-r--r--  src/kernels/level2/xher.opencl               | 4
-rw-r--r--  src/kernels/level2/xher2.opencl              | 4
-rw-r--r--  src/kernels/level3/convert_hermitian.opencl  | 8
-rw-r--r--  src/kernels/level3/convert_symmetric.opencl  | 8
-rw-r--r--  src/kernels/level3/convert_triangular.opencl | 8
-rw-r--r--  src/kernels/level3/copy_fast.opencl          | 4
-rw-r--r--  src/kernels/level3/copy_pad.opencl           | 8
-rw-r--r--  src/kernels/level3/transpose_fast.opencl     | 4
-rw-r--r--  src/kernels/level3/transpose_pad.opencl      | 8
-rw-r--r--  src/kernels/level3/xgemm_part2.opencl        | 12
21 files changed, 74 insertions(+), 74 deletions(-)
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 48d0eb5c..763c1c0e 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -30,8 +30,8 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xamax(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xamax(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global singlereal* maxgm, __global unsigned int* imaxgm) {
__local singlereal maxlm[WGS1];
@@ -95,8 +95,8 @@ __kernel void Xamax(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XamaxEpilogue(const __global singlereal* restrict maxgm,
const __global unsigned int* restrict imaxgm,
__global unsigned int* imax, const int imax_offset) {
__local singlereal maxlm[WGS2];
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 58d0f11b..1542a187 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -30,8 +30,8 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xasum(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xasum(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__local real lm[WGS1];
@@ -74,8 +74,8 @@ __kernel void Xasum(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XasumEpilogue(const __global real* restrict input,
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XasumEpilogue(const __global real* restrict input,
__global real* asum, const int asum_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index e0efadc1..73a4a535 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -22,8 +22,8 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xaxpy(const int n, const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
const real alpha = arg_alpha[0];
@@ -40,8 +40,8 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
const real alpha = arg_alpha[0];
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index 97c27ccf..c96c33ac 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -22,8 +22,8 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xcopy(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xcopy(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
@@ -38,8 +38,8 @@ __kernel void Xcopy(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XcopyFast(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XcopyFast(const int n,
const __global realV* restrict xgm,
__global realV* ygm) {
#pragma unroll
diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl
index e13eb3c1..210aee94 100644
--- a/src/kernels/level1/xdot.opencl
+++ b/src/kernels/level1/xdot.opencl
@@ -30,8 +30,8 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the sum operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xdot(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xdot(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
__global real* output, const int do_conjugate) {
@@ -73,8 +73,8 @@ __kernel void Xdot(const int n,
// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XdotEpilogue(const __global real* restrict input,
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XdotEpilogue(const __global real* restrict input,
__global real* dot, const int dot_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index 9803687a..3633efea 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -30,8 +30,8 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xnrm2(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xnrm2(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__local real lm[WGS1];
@@ -72,8 +72,8 @@ __kernel void Xnrm2(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void Xnrm2Epilogue(const __global real* restrict input,
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void Xnrm2Epilogue(const __global real* restrict input,
__global real* nrm2, const int nrm2_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index 59936776..61ff5bb6 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -22,8 +22,8 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xscal(const int n, const real alpha,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xscal(const int n, const real alpha,
__global real* xgm, const int x_offset, const int x_inc) {
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
@@ -40,8 +40,8 @@ __kernel void Xscal(const int n, const real alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XscalFast(const int n, const real alpha,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XscalFast(const int n, const real alpha,
__global realV* xgm) {
#pragma unroll
for (int w=0; w<WPT; ++w) {
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index f6487b58..40336a67 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -22,8 +22,8 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xswap(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xswap(const int n,
__global real* xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
@@ -40,8 +40,8 @@ __kernel void Xswap(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XswapFast(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XswapFast(const int n,
__global realV* xgm,
__global realV* ygm) {
#pragma unroll
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 65b4291f..41d44dab 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -210,8 +210,8 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
// =================================================================================================
// Full version of the kernel
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xgemv(const int m, const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xgemv(const int m, const int n,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const int a_rotated,
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 6a494e84..dae31a2b 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -94,8 +94,8 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XgemvFast(const int m, const int n,
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XgemvFast(const int m, const int n,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const int a_rotated,
@@ -196,8 +196,8 @@ __kernel void XgemvFast(const int m, const int n,
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS3, 1, 1)))
-__kernel void XgemvFastRot(const int m, const int n,
+__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
+void XgemvFastRot(const int m, const int n,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const int a_rotated,
diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl
index 63817afb..21744799 100644
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@@ -18,8 +18,8 @@ R"(
// =================================================================================================
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xger(const int max1, const int max2,
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xger(const int max1, const int max2,
const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* ygm, const int y_offset, const int y_inc,
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index fc635f2e..4b304a9f 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -18,8 +18,8 @@ R"(
// =================================================================================================
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher(const int n,
const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index a66f255f..8d05f020 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -18,8 +18,8 @@ R"(
// =================================================================================================
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher2(const int n,
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher2(const int n,
const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index 53cc161a..272905eb 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -20,8 +20,8 @@ R"(
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermLowerToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
@@ -59,8 +59,8 @@ __kernel void HermLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermUpperToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index c6ce93ca..ea6f7dbd 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -20,8 +20,8 @@ R"(
// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmLowerToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
@@ -53,8 +53,8 @@ __kernel void SymmLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmUpperToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index fdd2461a..858228bb 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -20,8 +20,8 @@ R"(
// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaLowerToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
@@ -55,8 +55,8 @@ __kernel void TriaLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaUpperToSquared(const int src_dim,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index 09e54e6d..54f9d987 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -35,8 +35,8 @@ R"(
// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
-__kernel void CopyMatrixFast(const int ld,
+__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+void CopyMatrixFast(const int ld,
__global const realC* restrict src,
__global realC* dest,
const __constant real* restrict arg_alpha) {
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index d276cc60..92279ecf 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -24,8 +24,8 @@ R"(
// Copies a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
// value and offset can be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyPadMatrix(const int src_one, const int src_two,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
@@ -65,8 +65,8 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
// writes only the actual data back to the destination matrix. Again, the ld value and offset can
// be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyMatrix(const int src_one, const int src_two,
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index d5c46a30..a2007408 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -36,8 +36,8 @@ R"(
// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
// offset. A more general version is available in 'padtranspose.opencl'.
-__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
-__kernel void TransposeMatrixFast(const int ld,
+__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+void TransposeMatrixFast(const int ld,
__global const realT* restrict src,
__global realT* dest,
const __constant real* restrict arg_alpha) {
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index 2de0c7bd..63cc6e9a 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -24,8 +24,8 @@ R"(
// Transposes a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the transposed source matrix dimensions.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposePadMatrix(const int src_one, const int src_two,
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposePadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
@@ -88,8 +88,8 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
// padded source matrix, but only the actual data is written back to the transposed destination
// matrix. This kernel optionally checks for upper/lower triangular matrices.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposeMatrix(const int src_one, const int src_two,
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposeMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 42c1127c..60e38c06 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -268,8 +268,8 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
// Main entry point of the kernel. This is the upper-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmUpper(const int kSizeN, const int kSizeK,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const __global realM* restrict agm,
@@ -308,8 +308,8 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
}
// Main entry point of the kernel. This is the lower-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmLower(const int kSizeN, const int kSizeK,
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmLower(const int kSizeN, const int kSizeK,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const __global realM* restrict agm,
@@ -352,8 +352,8 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
#else
// Main entry point of the kernel. This is the regular full version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const __global realM* restrict agm,