From 73f49e9b3d4abc4214122e4b8c07a736e01626ee Mon Sep 17 00:00:00 2001 From: "Angus, Alexander" Date: Tue, 17 Jan 2023 08:35:29 -0800 Subject: Updated according to feedback from CNugteren --- Adreno_730_tuning_results.zip | Bin 59988 -> 0 bytes Adreno_740_tuning_results.zip | Bin 59507 -> 0 bytes CHANGELOG | 5 +++++ README.md | 2 -- src/kernels/level1/xamax.opencl | 4 ++-- src/kernels/level1/xasum.opencl | 4 ++-- src/kernels/level1/xaxpy.opencl | 8 ++++---- src/kernels/level1/xcopy.opencl | 4 ++-- src/kernels/level1/xdot.opencl | 4 ++-- src/kernels/level1/xhad.opencl | 6 +++--- src/kernels/level1/xnrm2.opencl | 4 ++-- src/kernels/level1/xscal.opencl | 4 ++-- src/kernels/level1/xswap.opencl | 4 ++-- src/kernels/level2/xgemv.opencl | 2 +- src/kernels/level2/xgemv_fast.opencl | 4 ++-- src/kernels/level2/xger.opencl | 2 +- src/kernels/level2/xher.opencl | 2 +- src/kernels/level2/xher2.opencl | 2 +- src/kernels/level2/xtrsv.opencl | 4 ++-- src/kernels/level3/convert_hermitian.opencl | 4 ++-- src/kernels/level3/convert_symmetric.opencl | 4 ++-- src/kernels/level3/convert_triangular.opencl | 4 ++-- src/kernels/level3/copy_fast.opencl | 2 +- src/kernels/level3/copy_pad.opencl | 12 ++++++------ src/kernels/level3/invert_diagonal_blocks_part1.opencl | 2 +- src/kernels/level3/transpose_fast.opencl | 2 +- src/kernels/level3/transpose_pad.opencl | 12 ++++++------ src/kernels/level3/xgemm_batched.opencl | 4 ++-- src/kernels/level3/xgemm_direct_batched.opencl | 16 ++++++++-------- src/kernels/level3/xgemm_direct_part3.opencl | 8 ++++---- src/kernels/level3/xgemm_part4.opencl | 6 +++--- src/kernels/levelx/col2im.opencl | 4 ++-- src/kernels/levelx/im2col.opencl | 4 ++-- src/kernels/levelx/xconvgemm_part2.opencl | 6 +++--- src/utilities/compile.cpp | 4 ++-- src/utilities/utilities.cpp | 2 +- 36 files changed, 82 insertions(+), 79 deletions(-) delete mode 100644 Adreno_730_tuning_results.zip delete mode 100644 Adreno_740_tuning_results.zip diff --git a/Adreno_730_tuning_results.zip b/Adreno_730_tuning_results.zip deleted file mode 100644 index 6fa29af5..00000000 Binary files a/Adreno_730_tuning_results.zip and /dev/null differ diff --git a/Adreno_740_tuning_results.zip b/Adreno_740_tuning_results.zip deleted file mode 100644 index 6a5622b8..00000000 Binary files a/Adreno_740_tuning_results.zip and /dev/null differ diff --git a/CHANGELOG b/CHANGELOG index 7088fb49..824f4520 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,10 @@ Development version (next version) - Fixes two small issues in the plotting script +- Modifications to improve performance on Qualcomm Adreno GPUs: + * Unique database entries for specific Adreno devices + * Toggle OpenCL kernel compilation options for Adreno + * New preprocessor directive RELAX_WORKGROUP_SIZE +- Fixed a bug in handling of #undef in CLBlast loop unrolling and array-to-register mapping functions Version 1.5.3 - Fix a correctness issue with DGEMM on SM 7.5 Turing GPUs diff --git a/README.md b/README.md index 834d6221..cbecc606 100644 --- a/README.md +++ b/README.md @@ -101,8 +101,6 @@ Known performance related issues: * Severe performance issues with Beignet v1.3.0 due to missing support for local memory. Please downgrade to v1.2.1 or upgrade to v1.3.1 or newer. -* Performance issues on Qualcomm Adreno GPUs. 
- Other known issues: * Routines returning an integer are currently not properly tested for half-precision FP16: IHAMAX/IHAMIN/IHMAX/IHMIN diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 3600b9d2..06a6773b 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -32,7 +32,7 @@ R"( // The main reduction kernel, performing the loading and the majority of the operation #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) #endif void Xamax(const int n, @@ -102,7 +102,7 @@ void Xamax(const int n, // be launched with a single workgroup only. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) #endif void XamaxEpilogue(const __global singlereal* restrict maxgm, diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl index 875221f4..683c6fad 100644 --- a/src/kernels/level1/xasum.opencl +++ b/src/kernels/level1/xasum.opencl @@ -32,7 +32,7 @@ R"( // The main reduction kernel, performing the loading and the majority of the operation #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) #endif void Xasum(const int n, @@ -79,7 +79,7 @@ void Xasum(const int n, // be launched with a single workgroup only. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) #endif void XasumEpilogue(const __global real* restrict input, diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index b20ad200..a106ed01 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -24,7 +24,7 @@ R"( // Full version of the kernel with offsets and strided accesses #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void Xaxpy(const int n, const real_arg arg_alpha, @@ -43,7 +43,7 @@ void Xaxpy(const int n, const real_arg arg_alpha, // assumes that 'n' is dividable by 'VW' and 'WPT'. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XaxpyFaster(const int n, const real_arg arg_alpha, @@ -67,7 +67,7 @@ void XaxpyFaster(const int n, const real_arg arg_alpha, // dividable by 'VW', 'WGS' and 'WPT'. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XaxpyFastest(const int n, const real_arg arg_alpha, @@ -89,7 +89,7 @@ void XaxpyFastest(const int n, const real_arg arg_alpha, // Full version of the kernel with offsets and strided accesses: batched version #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XaxpyBatched(const int n, const __constant real_arg* arg_alphas, diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl index 174bf0c6..493197af 100644 --- a/src/kernels/level1/xcopy.opencl +++ b/src/kernels/level1/xcopy.opencl @@ -24,7 +24,7 @@ R"( // Full version of the kernel with offsets and strided accesses #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void Xcopy(const int n, @@ -43,7 +43,7 @@ void Xcopy(const int n, // dividable by 'VW', 'WGS' and 'WPT'. 
#if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XcopyFast(const int n, diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl index e14b6306..64f6eb9d 100644 --- a/src/kernels/level1/xdot.opencl +++ b/src/kernels/level1/xdot.opencl @@ -32,7 +32,7 @@ R"( // The main reduction kernel, performing the multiplication and the majority of the sum operation #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) #endif void Xdot(const int n, @@ -78,7 +78,7 @@ void Xdot(const int n, // be launched with a single workgroup only. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) #endif void XdotEpilogue(const __global real* restrict input, diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl index aee98f91..47bb5170 100644 --- a/src/kernels/level1/xhad.opencl +++ b/src/kernels/level1/xhad.opencl @@ -68,7 +68,7 @@ INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV // Full version of the kernel with offsets and strided accesses #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta, @@ -96,7 +96,7 @@ void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta, // assumes that 'n' is dividable by 'VW' and 'WPT'. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta, @@ -127,7 +127,7 @@ void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta, // dividable by 'VW', 'WGS' and 'WPT'. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta, diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl index fb45effb..36ea49b4 100644 --- a/src/kernels/level1/xnrm2.opencl +++ b/src/kernels/level1/xnrm2.opencl @@ -32,7 +32,7 @@ R"( // The main reduction kernel, performing the multiplication and the majority of the operation #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) #endif void Xnrm2(const int n, @@ -77,7 +77,7 @@ void Xnrm2(const int n, // be launched with a single workgroup only. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) #endif void Xnrm2Epilogue(const __global real* restrict input, diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl index 19ca9135..e4260c7c 100644 --- a/src/kernels/level1/xscal.opencl +++ b/src/kernels/level1/xscal.opencl @@ -24,7 +24,7 @@ R"( // Full version of the kernel with offsets and strided accesses #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void Xscal(const int n, const real_arg arg_alpha, @@ -46,7 +46,7 @@ void Xscal(const int n, const real_arg arg_alpha, // dividable by 'VW', 'WGS' and 'WPT'. 
#if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XscalFast(const int n, const real_arg arg_alpha, diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl index a2b44de3..2d384423 100644 --- a/src/kernels/level1/xswap.opencl +++ b/src/kernels/level1/xswap.opencl @@ -24,7 +24,7 @@ R"( // Full version of the kernel with offsets and strided accesses #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void Xswap(const int n, @@ -45,7 +45,7 @@ void Xswap(const int n, // dividable by 'VW', 'WGS' and 'WPT'. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) #endif void XswapFast(const int n, diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl index 16711aa4..15912a60 100644 --- a/src/kernels/level2/xgemv.opencl +++ b/src/kernels/level2/xgemv.opencl @@ -212,7 +212,7 @@ INLINE_FUNC real LoadMatrixA(const __global real* restrict agm, const int x, con // Full version of the kernel #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) #endif void Xgemv(const int m, const int n, diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 853d3d28..46087af7 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -90,7 +90,7 @@ INLINE_FUNC realVF LoadMatrixAVF(const __global realVF* restrict agm, const int // --> 'do_conjugate' is 0 #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) #endif void XgemvFast(const int m, const int n, @@ -197,7 +197,7 @@ void XgemvFast(const int m, const int n, // --> 'do_conjugate' is 0 #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS3, 1, 1))) #endif void XgemvFastRot(const int m, const int n, diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl index 3620c66a..b1fe8447 100644 --- a/src/kernels/level2/xger.opencl +++ b/src/kernels/level2/xger.opencl @@ -20,7 +20,7 @@ R"( // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC) #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) #endif void Xger(const int max1, const int max2, diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl index 9e460cd4..eac8c10a 100644 --- a/src/kernels/level2/xher.opencl +++ b/src/kernels/level2/xher.opencl @@ -20,7 +20,7 @@ R"( // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR) #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) #endif void Xher(const int n, diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl index c3e85c15..9e7f3c6c 100644 --- a/src/kernels/level2/xher2.opencl +++ b/src/kernels/level2/xher2.opencl @@ -20,7 +20,7 @@ R"( // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2) #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) #endif void Xher2(const int n, diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl index e3b5418c..7677377e 100644 --- a/src/kernels/level2/xtrsv.opencl +++ b/src/kernels/level2/xtrsv.opencl @@ -41,7 +41,7 @@ void 
FillVector(const int n, const int inc, const int offset, #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) #endif void trsv_forward(int n, @@ -93,7 +93,7 @@ void trsv_forward(int n, #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) #endif void trsv_backward(int n, diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl index b69be23d..4bb61f4e 100644 --- a/src/kernels/level3/convert_hermitian.opencl +++ b/src/kernels/level3/convert_hermitian.opencl @@ -23,7 +23,7 @@ R"( // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void HermLowerToSquared(const int src_dim, @@ -66,7 +66,7 @@ void HermLowerToSquared(const int src_dim, // Same as above, but now the matrix' data is stored in the upper-triangle #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void HermUpperToSquared(const int src_dim, diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl index 2ce17f40..264bf9c5 100644 --- a/src/kernels/level3/convert_symmetric.opencl +++ b/src/kernels/level3/convert_symmetric.opencl @@ -22,7 +22,7 @@ R"( // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void SymmLowerToSquared(const int src_dim, @@ -59,7 +59,7 @@ void SymmLowerToSquared(const int src_dim, // Same as above, but now the matrix' data is stored in the upper-triangle #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void SymmUpperToSquared(const int src_dim, diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl index 563f719f..092e1cf2 100644 --- a/src/kernels/level3/convert_triangular.opencl +++ b/src/kernels/level3/convert_triangular.opencl @@ -22,7 +22,7 @@ R"( // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void TriaLowerToSquared(const int src_dim, @@ -61,7 +61,7 @@ void TriaLowerToSquared(const int src_dim, // Same as above, but now the matrix' data is stored in the upper-triangle #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void TriaUpperToSquared(const int src_dim, diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index e1a815f5..04e4b2ab 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -37,7 +37,7 @@ R"( // COPY_VW. Also requires both matrices to be of the same dimensions and without offset. 
#if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) #endif void CopyMatrixFast(const int ld, diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index 6335bd81..dca93b76 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -61,7 +61,7 @@ INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two, // Interface to the above function #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyPadMatrix(const int src_one, const int src_two, @@ -124,7 +124,7 @@ INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two, // Interface to the above function #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyMatrix(const int src_one, const int src_two, @@ -148,7 +148,7 @@ void CopyMatrix(const int src_one, const int src_two, // Batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyPadMatrixBatched(const int src_one, const int src_two, @@ -170,7 +170,7 @@ void CopyPadMatrixBatched(const int src_one, const int src_two, // Batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyMatrixBatched(const int src_one, const int src_two, @@ -195,7 +195,7 @@ void CopyMatrixBatched(const int src_one, const int src_two, // Strided-batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyPadMatrixStridedBatched(const int src_one, const int src_two, @@ -217,7 +217,7 @@ void CopyPadMatrixStridedBatched(const int src_one, const int src_two, // Strided-batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) #endif void CopyMatrixStridedBatched(const int src_one, const int src_two, diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index 3df477d1..580f7b8b 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -84,7 +84,7 @@ R"( // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) #endif void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld, diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index e89984cc..9d1b7552 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -38,7 +38,7 @@ R"( // offset. A more general version is available in 'padtranspose.opencl'. 
#if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) #endif void TransposeMatrixFast(const int ld, diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index 31de0e62..3877a3d5 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -86,7 +86,7 @@ INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile, // Interface to the above function #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposePadMatrix(const int src_one, const int src_two, @@ -178,7 +178,7 @@ INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile, // Interface to the above function #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposeMatrix(const int src_one, const int src_two, @@ -203,7 +203,7 @@ void TransposeMatrix(const int src_one, const int src_two, // Batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposePadMatrixBatched(const int src_one, const int src_two, @@ -226,7 +226,7 @@ void TransposePadMatrixBatched(const int src_one, const int src_two, // Batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposeMatrixBatched(const int src_one, const int src_two, @@ -252,7 +252,7 @@ void TransposeMatrixBatched(const int src_one, const int src_two, // Strided-batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposePadMatrixStridedBatched(const int src_one, const int src_two, @@ -275,7 +275,7 @@ void TransposePadMatrixStridedBatched(const int src_one, const int src_two, // Strided-batched version of the above #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) #endif void TransposeMatrixStridedBatched(const int src_one, const int src_two, diff --git a/src/kernels/level3/xgemm_batched.opencl b/src/kernels/level3/xgemm_batched.opencl index e014b7a9..41d07d19 100644 --- a/src/kernels/level3/xgemm_batched.opencl +++ b/src/kernels/level3/xgemm_batched.opencl @@ -21,7 +21,7 @@ R"( #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, @@ -68,7 +68,7 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif void XgemmStridedBatched(const int kSizeM, const int kSizeN, const int kSizeK, diff --git a/src/kernels/level3/xgemm_direct_batched.opencl b/src/kernels/level3/xgemm_direct_batched.opencl index ec0b008b..102ae762 100644 --- a/src/kernels/level3/xgemm_direct_batched.opencl +++ b/src/kernels/level3/xgemm_direct_batched.opencl @@ -22,7 +22,7 @@ R"( // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void 
XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, @@ -47,7 +47,7 @@ void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, @@ -72,7 +72,7 @@ void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, // Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, @@ -97,7 +97,7 @@ void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, // Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, @@ -126,7 +126,7 @@ void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, @@ -149,7 +149,7 @@ void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int k // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, @@ -172,7 +172,7 @@ void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int k // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, @@ -195,7 +195,7 @@ void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int k // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectStridedBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl index 66b880e6..5508170e 100644 --- a/src/kernels/level3/xgemm_direct_part3.opencl +++ b/src/kernels/level3/xgemm_direct_part3.opencl @@ -220,7 +220,7 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize // Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, @@ -239,7 +239,7 @@ void XgemmDirectNN(const 
int kSizeM, const int kSizeN, const int kSizeK, // Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, @@ -258,7 +258,7 @@ void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, // Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, @@ -277,7 +277,7 @@ void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, // Direct version of the GEMM kernel with [A, B] = [transposed, transposed] #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK, diff --git a/src/kernels/level3/xgemm_part4.opencl b/src/kernels/level3/xgemm_part4.opencl index a64e2efa..05524337 100644 --- a/src/kernels/level3/xgemm_part4.opencl +++ b/src/kernels/level3/xgemm_part4.opencl @@ -21,7 +21,7 @@ R"( // Main entry point of the kernel. This is the upper-triangular version. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif void XgemmUpper(const int kSizeN, const int kSizeK, @@ -61,7 +61,7 @@ void XgemmUpper(const int kSizeN, const int kSizeK, // Main entry point of the kernel. This is the lower-triangular version. #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif void XgemmLower(const int kSizeN, const int kSizeK, @@ -105,7 +105,7 @@ void XgemmLower(const int kSizeN, const int kSizeK, // Main entry point of the kernel. This is the regular full version. 
#if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, diff --git a/src/kernels/levelx/col2im.opencl b/src/kernels/levelx/col2im.opencl index fcc307c6..ab0ffbfa 100644 --- a/src/kernels/levelx/col2im.opencl +++ b/src/kernels/levelx/col2im.opencl @@ -94,7 +94,7 @@ INLINE_FUNC void Xcol2im(const int input_h, const int input_w, const int channel // Kernel flip version of the Xcol2im kernel (for convolution) #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) #endif void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels, @@ -119,7 +119,7 @@ void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels, // Normal version of the Xcol2im kernel (for cross-correlation) #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) #endif void Xcol2imKernelNormal(const int input_h, const int input_w, const int channels, diff --git a/src/kernels/levelx/im2col.opencl b/src/kernels/levelx/im2col.opencl index 8324468e..59af38fc 100644 --- a/src/kernels/levelx/im2col.opencl +++ b/src/kernels/levelx/im2col.opencl @@ -76,7 +76,7 @@ INLINE_FUNC void Xim2col(const int input_h, const int input_w, const int channel // Kernel flip version of the Xim2col kernel (for convolution) #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) #endif void Xim2colKernelFlip(const int input_h, const int input_w, const int channels, @@ -97,7 +97,7 @@ void Xim2colKernelFlip(const int input_h, const int input_w, const int channels, // Normal version of the Xim2col kernel (for cross-correlation) #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) #endif void Xim2colKernelNormal(const int input_h, const int input_w, const int channels, diff --git a/src/kernels/levelx/xconvgemm_part2.opencl b/src/kernels/levelx/xconvgemm_part2.opencl index 79c40f59..38ddd7eb 100644 --- a/src/kernels/levelx/xconvgemm_part2.opencl +++ b/src/kernels/levelx/xconvgemm_part2.opencl @@ -25,7 +25,7 @@ R"( #if defined(CONVGEMM_WITH_IM2COL) #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void Xconvgemm(const int num_patches, const int num_kernels, const int patch_size, @@ -291,7 +291,7 @@ INLINE_FUNC void Xconvgemm(const int num_patches, const int num_kernels, const i #if !defined(CONVGEMM_WITH_IM2COL) #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch_size, @@ -316,7 +316,7 @@ void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch #if RELAX_WORKGROUP_SIZE == 1 __kernel -#elif +#else __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) #endif void XconvgemmNormal(const int num_patches, const int num_kernels, const int patch_size, diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp index 7170c30a..59aa6107 100644 --- a/src/utilities/compile.cpp +++ b/src/utilities/compile.cpp @@ -43,7 +43,7 @@ std::shared_ptr CompileFromSource( // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. 
This can improve
 // performance, but might result in a reduced accuracy.
-  if ((device.IsAMD() && device.IsGPU()) || device.IsQualcomm()) {
+  if ((device.IsAMD() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
     header_string += "#define USE_CL_MAD 1\n";
   }
@@ -54,7 +54,7 @@ std::shared_ptr<Program> CompileFromSource(
   // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
   // performance through better cache behaviour
-  if ((device.IsARM() && device.IsGPU()) || device.IsQualcomm()) {
+  if ((device.IsARM() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
     header_string += "#define GLOBAL_MEM_FENCE 1\n";
   }
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index 32de2e2e..fbdcf9c2 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -463,7 +463,7 @@ std::string GetDeviceArchitecture(const Device& device) {
   else if (device.HasExtension(kKhronosAttributesAMD)) {
     device_architecture = device.Name(); // Name is architecture for AMD APP and AMD ROCm
   }
-  else if (device.IsQualcomm()) { // queries the Adreno GPU architecture version
+  else if (device.IsQualcomm() && device.IsGPU()) { // queries the Adreno GPU architecture version
     device_architecture = device.AdrenoVersion();
   }
   // Note: no else - 'device_architecture' might be the empty string
-- 
cgit v1.2.3
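
Note on the recurring kernel hunks above: a bare "#elif" with no condition is
ill-formed in the OpenCL C preprocessor (when the preceding "#if" is false, the
"#elif" expression must be evaluated), which is why these hunks replace it with
"#else", the intended complement of the RELAX_WORKGROUP_SIZE test. A minimal
sketch of the corrected pattern follows; the kernel name Xexample and its body
are illustrative only, and WGS is normally supplied by the CLBlast tuner, so
the default below is an assumption for self-containment:

    #ifndef WGS
      #define WGS 64  // illustrative default; in CLBlast this comes from the tuner
    #endif

    // With RELAX_WORKGROUP_SIZE == 1 the attribute is omitted, letting the
    // driver (e.g. on Adreno) choose any workgroup size; otherwise the tuned
    // size is required at compile time.
    #if RELAX_WORKGROUP_SIZE == 1
      __kernel
    #else
      __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
    #endif
    void Xexample(const int n, __global float* xgm) {
      const int id = get_global_id(0);
      if (id < n) {
        xgm[id] += 1.0f;  // trivial placeholder body
      }
    }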
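
On the host side, the compile.cpp hunks show how CLBlast toggles kernel
compilation options per device: it appends "#define ..." lines to a header
string that is prepended to the kernel source before compilation. The sketch
below mirrors that pattern; the Device struct and BuildDeviceHeader are
simplified stand-ins for illustration, and the final RELAX_WORKGROUP_SIZE
toggle is an assumption, since this diff shows only the USE_CL_MAD and
GLOBAL_MEM_FENCE conditions:

    #include <string>

    // Simplified stand-in for CLBlast's Device wrapper: just the predicates
    // used in the hunks above (real CLBlast derives these from OpenCL device
    // queries).
    struct Device {
      bool is_amd = false;
      bool is_arm = false;
      bool is_qualcomm = false;
      bool is_gpu = false;
      bool IsAMD() const { return is_amd; }
      bool IsARM() const { return is_arm; }
      bool IsQualcomm() const { return is_qualcomm; }
      bool IsGPU() const { return is_gpu; }
    };

    // Builds the per-device preprocessor header prepended to every kernel.
    std::string BuildDeviceHeader(const Device& device) {
      std::string header_string;
      // Faster, non-IEEE-754-compliant mad() on AMD and Adreno GPUs.
      if ((device.IsAMD() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
        header_string += "#define USE_CL_MAD 1\n";
      }
      // Global memory fence in GEMM for better cache behaviour on these GPUs.
      if ((device.IsARM() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
        header_string += "#define GLOBAL_MEM_FENCE 1\n";
      }
      // Assumed analogous toggle for the new directive; not shown in this diff.
      if (device.IsQualcomm() && device.IsGPU()) {
        header_string += "#define RELAX_WORKGROUP_SIZE 1\n";
      }
      return header_string;
    }

    // Usage sketch: an Adreno GPU receives all three defines.
    //   Device adreno{false, false, true, true};
    //   std::string header = BuildDeviceHeader(adreno);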