diff options
Diffstat (limited to 'src/kernels/level3')
-rw-r--r-- | src/kernels/level3/convert_hermitian.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level3/convert_symmetric.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level3/convert_triangular.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level3/copy_fast.opencl | 6 | ||||
-rw-r--r-- | src/kernels/level3/copy_pad.opencl | 36 | ||||
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks_part1.opencl | 6 | ||||
-rw-r--r-- | src/kernels/level3/transpose_fast.opencl | 6 | ||||
-rw-r--r-- | src/kernels/level3/transpose_pad.opencl | 36 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_batched.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_direct_batched.opencl | 48 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_direct_part3.opencl | 24 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part4.opencl | 18 |
12 files changed, 190 insertions, 38 deletions
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl index 0e89b78b..b69be23d 100644 --- a/src/kernels/level3/convert_hermitian.opencl +++ b/src/kernels/level3/convert_hermitian.opencl @@ -21,7 +21,11 @@ R"( // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void HermLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, @@ -60,7 +64,11 @@ void HermLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void HermUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl index 83ecdd65..2ce17f40 100644 --- a/src/kernels/level3/convert_symmetric.opencl +++ b/src/kernels/level3/convert_symmetric.opencl @@ -20,7 +20,11 @@ R"( // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. 
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void SymmLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, @@ -53,7 +57,11 @@ void SymmLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void SymmUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl index a9d5e769..563f719f 100644 --- a/src/kernels/level3/convert_triangular.opencl +++ b/src/kernels/level3/convert_triangular.opencl @@ -20,7 +20,11 @@ R"( // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. 
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void TriaLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, @@ -55,7 +59,11 @@ void TriaLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void TriaUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index ef8a9017..e1a815f5 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -35,7 +35,11 @@ R"( // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of // COPY_VW. Also requires both matrices to be of the same dimensions and without offset. 
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#endif void CopyMatrixFast(const int ld, __global const realC* restrict src, __global realC* dest, diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index 3d389b74..6335bd81 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -59,7 +59,11 @@ INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyPadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -118,7 +122,11 @@ INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -138,7 +146,11 @@ void CopyMatrix(const int src_one, const int src_two, #if defined(ROUTINE_GEMMBATCHED) // Batched version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyPadMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -156,7 +168,11 @@ void CopyPadMatrixBatched(const int src_one, const int src_two, } // Batched 
version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -177,7 +193,11 @@ void CopyMatrixBatched(const int src_one, const int src_two, #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyPadMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, @@ -195,7 +215,11 @@ void CopyPadMatrixStridedBatched(const int src_one, const int src_two, } // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index c1f96bd7..3df477d1 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -82,7 +82,11 @@ R"( // ================================================================================================= // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix -__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) +#if 
RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) +#endif void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld, __global real* restrict dest, const int outer_block_size, const int unit_diagonal, const int is_upper) diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index 1b9fca45..e89984cc 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -36,7 +36,11 @@ R"( // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without // offset. A more general version is available in 'padtranspose.opencl'. -__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +#endif void TransposeMatrixFast(const int ld, __global const realT* restrict src, __global realT* dest, diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index e55a8b7c..31de0e62 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -84,7 +84,11 @@ INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposePadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -172,7 +176,11 @@ INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel 
__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposeMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -193,7 +201,11 @@ void TransposeMatrix(const int src_one, const int src_two, #if defined(ROUTINE_GEMMBATCHED) // Batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposePadMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -212,7 +224,11 @@ void TransposePadMatrixBatched(const int src_one, const int src_two, } // Batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposeMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -234,7 +250,11 @@ void TransposeMatrixBatched(const int src_one, const int src_two, #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposePadMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, @@ -253,7 +273,11 @@ void TransposePadMatrixStridedBatched(const int src_one, const int src_two, } // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if 
RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposeMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, diff --git a/src/kernels/level3/xgemm_batched.opencl b/src/kernels/level3/xgemm_batched.opencl index b51e6298..e014b7a9 100644 --- a/src/kernels/level3/xgemm_batched.opencl +++ b/src/kernels/level3/xgemm_batched.opencl @@ -19,7 +19,11 @@ R"( // ================================================================================================= #if defined(ROUTINE_GEMMBATCHED) -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, @@ -62,7 +66,11 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, // ================================================================================================= #if defined(ROUTINE_GEMMSTRIDEDBATCHED) -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmStridedBatched(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realM* restrict agm, const int a_one, const int a_two, diff --git a/src/kernels/level3/xgemm_direct_batched.opencl b/src/kernels/level3/xgemm_direct_batched.opencl index d15ed31e..ec0b008b 100644 --- a/src/kernels/level3/xgemm_direct_batched.opencl +++ b/src/kernels/level3/xgemm_direct_batched.opencl @@ -20,7 +20,11 @@ R"( #if defined(ROUTINE_GEMMBATCHED) // Direct version of the batched GEMM kernel with [A, B] = 
[non-transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -41,7 +45,11 @@ void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -62,7 +70,11 @@ void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -83,7 +95,11 @@ void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 
+ __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -108,7 +124,11 @@ void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, @@ -127,7 +147,11 @@ void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int k } // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, @@ -146,7 +170,11 @@ void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int k } // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, 
NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, @@ -165,7 +193,11 @@ void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int k } // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl index 0822c95f..66b880e6 100644 --- a/src/kernels/level3/xgemm_direct_part3.opencl +++ b/src/kernels/level3/xgemm_direct_part3.opencl @@ -218,7 +218,11 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize // ================================================================================================= // Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, @@ -233,7 +237,11 @@ void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] -__kernel 
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, @@ -248,7 +256,11 @@ void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, @@ -263,7 +275,11 @@ void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the GEMM kernel with [A, B] = [transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, diff --git a/src/kernels/level3/xgemm_part4.opencl b/src/kernels/level3/xgemm_part4.opencl index b1f1ade6..a64e2efa 100644 --- a/src/kernels/level3/xgemm_part4.opencl +++ b/src/kernels/level3/xgemm_part4.opencl @@ -19,7 +19,11 @@ R"( #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) // Main entry point of the kernel. This is the upper-triangular version. 
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmUpper(const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, @@ -55,7 +59,11 @@ void XgemmUpper(const int kSizeN, const int kSizeK, } // Main entry point of the kernel. This is the lower-triangular version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmLower(const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, @@ -95,7 +103,11 @@ void XgemmLower(const int kSizeN, const int kSizeK, #else // Main entry point of the kernel. This is the regular full version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#else + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, |