Diffstat (limited to 'src/kernels/level3')
-rw-r--r--  src/kernels/level3/convert_hermitian.opencl             | 12 ++++++++++--
-rw-r--r--  src/kernels/level3/convert_symmetric.opencl             | 12 ++++++++++--
-rw-r--r--  src/kernels/level3/convert_triangular.opencl            | 12 ++++++++++--
-rw-r--r--  src/kernels/level3/copy_fast.opencl                     |  6 +++++-
-rw-r--r--  src/kernels/level3/copy_pad.opencl                      | 36 ++++++++++++++++++++++++++++++------
-rw-r--r--  src/kernels/level3/invert_diagonal_blocks_part1.opencl  |  6 +++++-
-rw-r--r--  src/kernels/level3/transpose_fast.opencl                |  6 +++++-
-rw-r--r--  src/kernels/level3/transpose_pad.opencl                 | 36 ++++++++++++++++++++++++++++++------
-rw-r--r--  src/kernels/level3/xgemm_batched.opencl                 | 12 ++++++++++--
-rw-r--r--  src/kernels/level3/xgemm_direct_batched.opencl          | 48 ++++++++++++++++++++++++++++++++++++++++++++--------
-rw-r--r--  src/kernels/level3/xgemm_direct_part3.opencl            | 24 ++++++++++++++++++------
-rw-r--r--  src/kernels/level3/xgemm_part4.opencl                   | 18 +++++++++++++++---
12 files changed, 190 insertions(+), 38 deletions(-)
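
Every hunk below makes the same change: the unconditional reqd_work_group_size attribute on each kernel entry point is wrapped in a RELAX_WORKGROUP_SIZE guard, so the hard work-group-size requirement can be compiled out by defining RELAX_WORKGROUP_SIZE=1 in the build options. When the macro is undefined, the preprocessor evaluates it as 0 in the #if, the attribute branch is kept, and behaviour is unchanged. Below is a minimal host-side sketch of how such a switch is typically supplied and what it buys at launch time; build_relaxed is a hypothetical helper, not CLBlast's actual plumbing, though clCreateProgramWithSource, clBuildProgram and clCreateKernel are the standard OpenCL host calls.

#include <CL/cl.h>
#include <stddef.h>

/* Hypothetical helper: compile a kernel with the work-group-size
 * requirement relaxed. Illustrative only. */
cl_kernel build_relaxed(cl_context ctx, cl_device_id dev,
                        const char *source, const char *kernel_name,
                        cl_int *err) {
  cl_program prog = clCreateProgramWithSource(ctx, 1, &source, NULL, err);
  if (*err != CL_SUCCESS) { return NULL; }
  /* Defining RELAX_WORKGROUP_SIZE=1 selects the #if branch without
   * reqd_work_group_size, so the kernel accepts any legal local size. */
  *err = clBuildProgram(prog, 1, &dev, "-DRELAX_WORKGROUP_SIZE=1",
                        NULL, NULL);
  if (*err != CL_SUCCESS) { return NULL; }
  return clCreateKernel(prog, kernel_name, err);
}

/* Launching without an explicit local size is then valid, because the
 * attribute was compiled out. With reqd_work_group_size in place, a NULL
 * or mismatching local size fails with CL_INVALID_WORK_GROUP_SIZE:
 *
 *   size_t global[2] = {padded_m, padded_n};
 *   clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL,
 *                          0, NULL, NULL);
 */
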
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index 0e89b78b..4bb61f4e 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -21,7 +21,11 @@ R"(
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void HermLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -60,7 +64,11 @@ void HermLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void HermUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index 83ecdd65..264bf9c5 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -20,7 +20,11 @@ R"(
// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void SymmLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -53,7 +57,11 @@ void SymmLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void SymmUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index a9d5e769..092e1cf2 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -20,7 +20,11 @@ R"(
// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void TriaLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -55,7 +59,11 @@ void TriaLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void TriaUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index ef8a9017..04e4b2ab 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -35,7 +35,11 @@ R"(
// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
void CopyMatrixFast(const int ld,
__global const realC* restrict src,
__global realC* dest,
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index 3d389b74..dca93b76 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -59,7 +59,11 @@ INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two,
}
// Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -118,7 +122,11 @@ INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two,
}
// Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -138,7 +146,11 @@ void CopyMatrix(const int src_one, const int src_two,
#if defined(ROUTINE_GEMMBATCHED)
// Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyPadMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
@@ -156,7 +168,11 @@ void CopyPadMatrixBatched(const int src_one, const int src_two,
}
// Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
@@ -177,7 +193,11 @@ void CopyMatrixBatched(const int src_one, const int src_two,
#if defined(ROUTINE_GEMMSTRIDEDBATCHED)
// Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyPadMatrixStridedBatched(const int src_one, const int src_two,
const int src_ld, const int src_offset,
const int src_stride, __global const real* restrict src,
@@ -195,7 +215,11 @@ void CopyPadMatrixStridedBatched(const int src_one, const int src_two,
}
// Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyMatrixStridedBatched(const int src_one, const int src_two,
const int src_ld, const int src_offset,
const int src_stride, __global const real* restrict src,
diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index c1f96bd7..580f7b8b 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -82,7 +82,11 @@ R"(
// =================================================================================================
// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
-__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
+#endif
void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld,
__global real* restrict dest, const int outer_block_size,
const int unit_diagonal, const int is_upper)
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index 1b9fca45..9d1b7552 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -36,7 +36,11 @@ R"(
// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
// offset. A more general version is available in 'padtranspose.opencl'.
-__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+#endif
void TransposeMatrixFast(const int ld,
__global const realT* restrict src,
__global realT* dest,
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index e55a8b7c..3877a3d5 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -84,7 +84,11 @@ INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile,
}
// Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposePadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -172,7 +176,11 @@ INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile,
}
// Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposeMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -193,7 +201,11 @@ void TransposeMatrix(const int src_one, const int src_two,
#if defined(ROUTINE_GEMMBATCHED)
// Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposePadMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
@@ -212,7 +224,11 @@ void TransposePadMatrixBatched(const int src_one, const int src_two,
}
// Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposeMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
@@ -234,7 +250,11 @@ void TransposeMatrixBatched(const int src_one, const int src_two,
#if defined(ROUTINE_GEMMSTRIDEDBATCHED)
// Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposePadMatrixStridedBatched(const int src_one, const int src_two,
const int src_ld, const int src_offset,
const int src_stride, __global const real* restrict src,
@@ -253,7 +273,11 @@ void TransposePadMatrixStridedBatched(const int src_one, const int src_two,
}
// Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposeMatrixStridedBatched(const int src_one, const int src_two,
const int src_ld, const int src_offset,
const int src_stride, __global const real* restrict src,
diff --git a/src/kernels/level3/xgemm_batched.opencl b/src/kernels/level3/xgemm_batched.opencl
index b51e6298..41d07d19 100644
--- a/src/kernels/level3/xgemm_batched.opencl
+++ b/src/kernels/level3/xgemm_batched.opencl
@@ -19,7 +19,11 @@ R"(
// =================================================================================================
#if defined(ROUTINE_GEMMBATCHED)
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas,
const __constant real_arg* arg_betas,
@@ -62,7 +66,11 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
// =================================================================================================
#if defined(ROUTINE_GEMMSTRIDEDBATCHED)
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
void XgemmStridedBatched(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realM* restrict agm, const int a_one, const int a_two,
diff --git a/src/kernels/level3/xgemm_direct_batched.opencl b/src/kernels/level3/xgemm_direct_batched.opencl
index d15ed31e..102ae762 100644
--- a/src/kernels/level3/xgemm_direct_batched.opencl
+++ b/src/kernels/level3/xgemm_direct_batched.opencl
@@ -20,7 +20,11 @@ R"(
#if defined(ROUTINE_GEMMBATCHED)
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -41,7 +45,11 @@ void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -62,7 +70,11 @@ void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -83,7 +95,11 @@ void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -108,7 +124,11 @@ void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
#if defined(ROUTINE_GEMMSTRIDEDBATCHED)
// Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -127,7 +147,11 @@ void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int k
}
// Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -146,7 +170,11 @@ void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int k
}
// Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -165,7 +193,11 @@ void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int k
}
// Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
index 0822c95f..5508170e 100644
--- a/src/kernels/level3/xgemm_direct_part3.opencl
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -218,7 +218,11 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize
// =================================================================================================
// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld,
@@ -233,7 +237,11 @@ void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld,
@@ -248,7 +256,11 @@ void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld,
@@ -263,7 +275,11 @@ void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld,
diff --git a/src/kernels/level3/xgemm_part4.opencl b/src/kernels/level3/xgemm_part4.opencl
index b1f1ade6..05524337 100644
--- a/src/kernels/level3/xgemm_part4.opencl
+++ b/src/kernels/level3/xgemm_part4.opencl
@@ -19,7 +19,11 @@ R"(
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
// Main entry point of the kernel. This is the upper-triangular version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
void XgemmUpper(const int kSizeN, const int kSizeK,
const real_arg arg_alpha,
const real_arg arg_beta,
@@ -55,7 +59,11 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
}
// Main entry point of the kernel. This is the lower-triangular version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
void XgemmLower(const int kSizeN, const int kSizeK,
const real_arg arg_alpha,
const real_arg arg_beta,
@@ -95,7 +103,11 @@ void XgemmLower(const int kSizeN, const int kSizeK,
#else
// Main entry point of the kernel. This is the regular full version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha,
const real_arg arg_beta,
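
Relaxing the attribute trades a compiler hint (with reqd_work_group_size the compiler can specialize barriers and unrolling for a fixed work-group shape) for launch-time flexibility. A host that builds with RELAX_WORKGROUP_SIZE=1 can still choose a legal local size at runtime instead of hard-coding the tuned PAD_DIMX/PAD_DIMY-style constants. A sketch under that assumption; pick_local_size is a hypothetical helper, while clGetKernelWorkGroupInfo with CL_KERNEL_WORK_GROUP_SIZE is the standard query:

#include <CL/cl.h>
#include <stddef.h>

/* Hypothetical helper, illustrative only: choose a square-ish 2D local
 * size for a kernel built with -DRELAX_WORKGROUP_SIZE=1. */
void pick_local_size(cl_kernel kernel, cl_device_id dev, size_t local[2]) {
  size_t max_wg = 1;
  clGetKernelWorkGroupInfo(kernel, dev, CL_KERNEL_WORK_GROUP_SIZE,
                           sizeof(max_wg), &max_wg, NULL);
  /* Largest power-of-two square within the device/kernel limit. */
  size_t side = 1;
  while (side * side * 4 <= max_wg) { side *= 2; }
  local[0] = side;
  local[1] = side;
  /* Callers must still round the global sizes up to a multiple of the
   * chosen local sizes before enqueueing. */
}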