diff options
Diffstat (limited to 'src/kernels/level3')
-rw-r--r-- | src/kernels/level3/convert_hermitian.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level3/convert_symmetric.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level3/convert_triangular.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level3/copy_fast.opencl | 6 | ||||
-rw-r--r-- | src/kernels/level3/copy_pad.opencl | 36 | ||||
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks_part1.opencl | 6 | ||||
-rw-r--r-- | src/kernels/level3/transpose_fast.opencl | 6 | ||||
-rw-r--r-- | src/kernels/level3/transpose_pad.opencl | 36 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_batched.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_direct_batched.opencl | 48 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_direct_part3.opencl | 24 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part4.opencl | 18 |
12 files changed, 190 insertions, 38 deletions
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl index 0e89b78b..b69be23d 100644 --- a/src/kernels/level3/convert_hermitian.opencl +++ b/src/kernels/level3/convert_hermitian.opencl @@ -21,7 +21,11 @@ R"( // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void HermLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, @@ -60,7 +64,11 @@ void HermLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void HermUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl index 83ecdd65..2ce17f40 100644 --- a/src/kernels/level3/convert_symmetric.opencl +++ b/src/kernels/level3/convert_symmetric.opencl @@ -20,7 +20,11 @@ R"( // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. 
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void SymmLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, @@ -53,7 +57,11 @@ void SymmLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void SymmUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl index a9d5e769..563f719f 100644 --- a/src/kernels/level3/convert_triangular.opencl +++ b/src/kernels/level3/convert_triangular.opencl @@ -20,7 +20,11 @@ R"( // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. 
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void TriaLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, @@ -55,7 +59,11 @@ void TriaLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void TriaUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index ef8a9017..e1a815f5 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -35,7 +35,11 @@ R"( // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of // COPY_VW. Also requires both matrices to be of the same dimensions and without offset. 
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#endif void CopyMatrixFast(const int ld, __global const realC* restrict src, __global realC* dest, diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index 3d389b74..6335bd81 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -59,7 +59,11 @@ INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyPadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -118,7 +122,11 @@ INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -138,7 +146,11 @@ void CopyMatrix(const int src_one, const int src_two, #if defined(ROUTINE_GEMMBATCHED) // Batched version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyPadMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -156,7 +168,11 @@ void CopyPadMatrixBatched(const int src_one, const int src_two, } // Batched 
version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -177,7 +193,11 @@ void CopyMatrixBatched(const int src_one, const int src_two, #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyPadMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, @@ -195,7 +215,11 @@ void CopyPadMatrixStridedBatched(const int src_one, const int src_two, } // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index c1f96bd7..3df477d1 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -82,7 +82,11 @@ R"( // ================================================================================================= // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix -__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) +#if 
RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) +#endif void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld, __global real* restrict dest, const int outer_block_size, const int unit_diagonal, const int is_upper) diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index 1b9fca45..e89984cc 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -36,7 +36,11 @@ R"( // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without // offset. A more general version is available in 'padtranspose.opencl'. -__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +#endif void TransposeMatrixFast(const int ld, __global const realT* restrict src, __global realT* dest, diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index e55a8b7c..31de0e62 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -84,7 +84,11 @@ INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposePadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -172,7 +176,11 @@ INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel 
__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposeMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -193,7 +201,11 @@ void TransposeMatrix(const int src_one, const int src_two, #if defined(ROUTINE_GEMMBATCHED) // Batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposePadMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -212,7 +224,11 @@ void TransposePadMatrixBatched(const int src_one, const int src_two, } // Batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposeMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -234,7 +250,11 @@ void TransposeMatrixBatched(const int src_one, const int src_two, #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposePadMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, @@ -253,7 +273,11 @@ void TransposePadMatrixStridedBatched(const int src_one, const int src_two, } // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if 
RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposeMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, diff --git a/src/kernels/level3/xgemm_batched.opencl b/src/kernels/level3/xgemm_batched.opencl index b51e6298..e014b7a9 100644 --- a/src/kernels/level3/xgemm_batched.opencl +++ b/src/kernels/level3/xgemm_batched.opencl @@ -19,7 +19,11 @@ R"( // ================================================================================================= #if defined(ROUTINE_GEMMBATCHED) -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, @@ -62,7 +66,11 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, // ================================================================================================= #if defined(ROUTINE_GEMMSTRIDEDBATCHED) -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmStridedBatched(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realM* restrict agm, const int a_one, const int a_two, diff --git a/src/kernels/level3/xgemm_direct_batched.opencl b/src/kernels/level3/xgemm_direct_batched.opencl index d15ed31e..ec0b008b 100644 --- a/src/kernels/level3/xgemm_direct_batched.opencl +++ b/src/kernels/level3/xgemm_direct_batched.opencl @@ -20,7 +20,11 @@ R"( #if defined(ROUTINE_GEMMBATCHED) // Direct version of the batched GEMM kernel with [A, B] = 
[non-transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -41,7 +45,11 @@ void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -62,7 +70,11 @@ void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -83,7 +95,11 @@ void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 
+ __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -108,7 +124,11 @@ void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, @@ -127,7 +147,11 @@ void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int k } // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, @@ -146,7 +170,11 @@ void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int k } // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, 
NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, @@ -165,7 +193,11 @@ void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int k } // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl index 0822c95f..66b880e6 100644 --- a/src/kernels/level3/xgemm_direct_part3.opencl +++ b/src/kernels/level3/xgemm_direct_part3.opencl @@ -218,7 +218,11 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize // ================================================================================================= // Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, @@ -233,7 +237,11 @@ void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] -__kernel 
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, @@ -248,7 +256,11 @@ void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, @@ -263,7 +275,11 @@ void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the GEMM kernel with [A, B] = [transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, diff --git a/src/kernels/level3/xgemm_part4.opencl b/src/kernels/level3/xgemm_part4.opencl index b1f1ade6..a64e2efa 100644 --- a/src/kernels/level3/xgemm_part4.opencl +++ b/src/kernels/level3/xgemm_part4.opencl @@ -19,7 +19,11 @@ R"( #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) // Main entry point of the kernel. This is the upper-triangular version. 
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmUpper(const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, @@ -55,7 +59,11 @@ void XgemmUpper(const int kSizeN, const int kSizeK, } // Main entry point of the kernel. This is the lower-triangular version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmLower(const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, @@ -95,7 +103,11 @@ void XgemmLower(const int kSizeN, const int kSizeK, #else // Main entry point of the kernel. This is the regular full version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, |