From 4f394608a28f419dfd6091c704148d9e638a26f0 Mon Sep 17 00:00:00 2001 From: "Angus, Alexander" Date: Tue, 3 Jan 2023 10:56:04 -0800 Subject: implemented changes to boost Adreno performance according to https://jira-dc.qualcomm.com/jira/browse/OSR-8731 --- src/clpp11.hpp | 8 ++++ src/database/kernels/copy/copy_32.hpp | 1 + src/kernel_preprocessor.cpp | 19 +++++++++ src/kernels/common.opencl | 6 +++ src/kernels/level1/xamax.opencl | 12 +++++- src/kernels/level1/xasum.opencl | 12 +++++- src/kernels/level1/xaxpy.opencl | 24 +++++++++-- src/kernels/level1/xcopy.opencl | 12 +++++- src/kernels/level1/xdot.opencl | 12 +++++- src/kernels/level1/xhad.opencl | 18 ++++++-- src/kernels/level1/xnrm2.opencl | 12 +++++- src/kernels/level1/xscal.opencl | 12 +++++- src/kernels/level1/xswap.opencl | 12 +++++- src/kernels/level2/xgemv.opencl | 6 ++- src/kernels/level2/xgemv_fast.opencl | 12 +++++- src/kernels/level2/xger.opencl | 6 ++- src/kernels/level2/xher.opencl | 6 ++- src/kernels/level2/xher2.opencl | 6 ++- src/kernels/level2/xtrsv.opencl | 12 +++++- src/kernels/level3/convert_hermitian.opencl | 12 +++++- src/kernels/level3/convert_symmetric.opencl | 12 +++++- src/kernels/level3/convert_triangular.opencl | 12 +++++- src/kernels/level3/copy_fast.opencl | 6 ++- src/kernels/level3/copy_pad.opencl | 36 +++++++++++++--- .../level3/invert_diagonal_blocks_part1.opencl | 6 ++- src/kernels/level3/transpose_fast.opencl | 6 ++- src/kernels/level3/transpose_pad.opencl | 36 +++++++++++++--- src/kernels/level3/xgemm_batched.opencl | 12 +++++- src/kernels/level3/xgemm_direct_batched.opencl | 48 ++++++++++++++++++---- src/kernels/level3/xgemm_direct_part3.opencl | 24 +++++++++-- src/kernels/level3/xgemm_part4.opencl | 18 ++++++-- src/kernels/levelx/col2im.opencl | 12 +++++- src/kernels/levelx/im2col.opencl | 12 +++++- src/kernels/levelx/xconvgemm_part2.opencl | 18 ++++++-- src/utilities/compile.cpp | 12 ++++-- src/utilities/utilities.cpp | 3 ++ 36 files changed, 416 insertions(+), 77 deletions(-) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 2a25606c..e5b8b4a7 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -365,6 +365,14 @@ class Device { return false; } + // Returns the Qualcomm Adreno GPU version (i.e. a650, a730, a740, etc.) 
+ std::string AdrenoVersion() const { + if (IsQualcomm()) { + return GetInfoString(CL_DEVICE_OPENCL_C_VERSION); + } + else { return std::string{""}; } + } + // Retrieves the above extra information (if present) std::string GetExtraInfo() const { if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); } diff --git a/src/database/kernels/copy/copy_32.hpp b/src/database/kernels/copy/copy_32.hpp index f5ac8e30..f7ae1edb 100644 --- a/src/database/kernels/copy/copy_32.hpp +++ b/src/database/kernels/copy/copy_32.hpp @@ -135,6 +135,7 @@ const DatabaseEntry CopySingle = { { Name{"GeForce GTX 670 "}, Params{ 16, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 680 "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { Name{"GeForce GTX 760 Ti OEM "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { Name{"Quadro K600 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { kDeviceNameDefault , Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, } }, { "SM3.5", { diff --git a/src/kernel_preprocessor.cpp b/src/kernel_preprocessor.cpp index abe0cd76..7271237a 100644 --- a/src/kernel_preprocessor.cpp +++ b/src/kernel_preprocessor.cpp @@ -371,6 +371,25 @@ std::vector PreprocessDefinesAndComments(const std::string& source, defines_string.emplace(name, value); } + // Detect #undef macros + // When USE_SUBGROUP_SHUFFLING is set, but kernel parameters do not satisfy the conditions + // for subgroup shuffle, USE_SUBGROUP_SHUFFLING needs to be unset in preprocessing + // to avoid GEMM kernel errors. See src/kernels/level3/xgemm_part1.opencl line 142. + // In this preprocessor, macros are not redefined because of behavior defined by std::map::emplace + const auto undef_pos = line.find("#undef "); + if (undef_pos != std::string::npos) { + const auto undef = line.substr(undef_pos + 7); // length of "#undef " + // checks if definition is found in defines_int and/or defines_string, then removes the definition + auto int_undef = defines_int.find(undef); + if (int_undef != defines_int.end()){ + defines_int.erase(int_undef); + } + auto string_undef = defines_string.find(undef); + if (string_undef != defines_string.end()){ + defines_string.erase(string_undef); + } + } + // Detect #ifndef blocks const auto ifndef_pos = line.find("#ifndef "); if (ifndef_pos != std::string::npos) { diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 56c1dae4..0ec741ad 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -132,6 +132,12 @@ R"( #define USE_CL_MAD 0 #endif +// By default the workgroup size requirement is enabled. 
For Qualcomm devices the workgroup size +// requirement results in worse performance and is disabled (src/utilities/compile.cpp) +#ifndef RELAX_WORKGROUP_SIZE + #define RELAX_WORKGROUP_SIZE 0 +#endif + // Sets a variable to zero #if PRECISION == 3232 || PRECISION == 6464 #define SetToZero(a) a.x = ZERO; a.y = ZERO diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 85cbdc86..3600b9d2 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -30,7 +30,11 @@ R"( // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation -__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#endif void Xamax(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global singlereal* maxgm, __global unsigned int* imaxgm) { @@ -96,7 +100,11 @@ void Xamax(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. -__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#endif void XamaxEpilogue(const __global singlereal* restrict maxgm, const __global unsigned int* restrict imaxgm, __global unsigned int* imax, const int imax_offset) { diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl index 42e49d4c..875221f4 100644 --- a/src/kernels/level1/xasum.opencl +++ b/src/kernels/level1/xasum.opencl @@ -30,7 +30,11 @@ R"( // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation -__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#endif void Xasum(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* output) { @@ -73,7 +77,11 @@ void Xasum(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. 
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#endif void XasumEpilogue(const __global real* restrict input, __global real* asum, const int asum_offset) { __local real lm[WGS2]; diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index 772b57f3..b20ad200 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -22,7 +22,11 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void Xaxpy(const int n, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { @@ -37,7 +41,11 @@ void Xaxpy(const int n, const real_arg arg_alpha, // Faster version of the kernel without offsets and strided accesses but with if-statement. Also // assumes that 'n' is dividable by 'VW' and 'WPT'. -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XaxpyFaster(const int n, const real_arg arg_alpha, const __global realV* restrict xgm, __global realV* ygm) { @@ -57,7 +65,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XaxpyFastest(const int n, const real_arg arg_alpha, const __global realV* restrict xgm, __global realV* ygm) { @@ -75,7 +87,11 @@ void XaxpyFastest(const int n, const real_arg arg_alpha, // ================================================================================================= // Full version of the kernel with offsets and strided accesses: batched version -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XaxpyBatched(const int n, const __constant real_arg* arg_alphas, const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc, __global real* ygm, const __constant int* y_offsets, const int y_inc) { diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl index aed80fc2..174bf0c6 100644 --- a/src/kernels/level1/xcopy.opencl +++ b/src/kernels/level1/xcopy.opencl @@ -22,7 +22,11 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void Xcopy(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { @@ -37,7 +41,11 @@ void Xcopy(const int n, // Faster version of the kernel without offsets and strided accesses. 
Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XcopyFast(const int n, const __global realV* restrict xgm, __global realV* ygm) { diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl index 1a703d96..e14b6306 100644 --- a/src/kernels/level1/xdot.opencl +++ b/src/kernels/level1/xdot.opencl @@ -30,7 +30,11 @@ R"( // ================================================================================================= // The main reduction kernel, performing the multiplication and the majority of the sum operation -__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#endif void Xdot(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, @@ -72,7 +76,11 @@ void Xdot(const int n, // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to // be launched with a single workgroup only. -__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#endif void XdotEpilogue(const __global real* restrict input, __global real* dot, const int dot_offset) { __local real lm[WGS2]; diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl index 24e0c76c..aee98f91 100644 --- a/src/kernels/level1/xhad.opencl +++ b/src/kernels/level1/xhad.opencl @@ -66,7 +66,11 @@ INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, @@ -90,7 +94,11 @@ void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta, // Faster version of the kernel without offsets and strided accesses but with if-statement. Also // assumes that 'n' is dividable by 'VW' and 'WPT'. -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta, const __global realV* restrict xgm, const __global realV* restrict ygm, __global realV* zgm) { @@ -117,7 +125,11 @@ void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. 
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta, const __global realV* restrict xgm, const __global realV* restrict ygm, __global realV* zgm) { diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl index 6a81c150..fb45effb 100644 --- a/src/kernels/level1/xnrm2.opencl +++ b/src/kernels/level1/xnrm2.opencl @@ -30,7 +30,11 @@ R"( // ================================================================================================= // The main reduction kernel, performing the multiplication and the majority of the operation -__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#endif void Xnrm2(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* output) { @@ -71,7 +75,11 @@ void Xnrm2(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. -__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#endif void Xnrm2Epilogue(const __global real* restrict input, __global real* nrm2, const int nrm2_offset) { __local real lm[WGS2]; diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl index cb133e88..19ca9135 100644 --- a/src/kernels/level1/xscal.opencl +++ b/src/kernels/level1/xscal.opencl @@ -22,7 +22,11 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void Xscal(const int n, const real_arg arg_alpha, __global real* xgm, const int x_offset, const int x_inc) { const real alpha = GetRealArg(arg_alpha); @@ -40,7 +44,11 @@ void Xscal(const int n, const real_arg arg_alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XscalFast(const int n, const real_arg arg_alpha, __global realV* xgm) { const real alpha = GetRealArg(arg_alpha); diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl index bf5b6194..a2b44de3 100644 --- a/src/kernels/level1/xswap.opencl +++ b/src/kernels/level1/xswap.opencl @@ -22,7 +22,11 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void Xswap(const int n, __global real* xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { @@ -39,7 +43,11 @@ void Xswap(const int n, // Faster version of the kernel without offsets and strided accesses. 
Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XswapFast(const int n, __global realV* xgm, __global realV* ygm) { diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl index ba29aba6..16711aa4 100644 --- a/src/kernels/level2/xgemv.opencl +++ b/src/kernels/level2/xgemv.opencl @@ -210,7 +210,11 @@ INLINE_FUNC real LoadMatrixA(const __global real* restrict agm, const int x, con // ================================================================================================= // Full version of the kernel -__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#endif void Xgemv(const int m, const int n, const real_arg arg_alpha, const real_arg arg_beta, diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 45ceb36c..853d3d28 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -88,7 +88,11 @@ INLINE_FUNC realVF LoadMatrixAVF(const __global realVF* restrict agm, const int // --> 'a_ld' is a multiple of VW2 // --> 'a_rotated' is 0 // --> 'do_conjugate' is 0 -__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#endif void XgemvFast(const int m, const int n, const real_arg arg_alpha, const real_arg arg_beta, @@ -191,7 +195,11 @@ void XgemvFast(const int m, const int n, // --> 'a_ld' is a multiple of VW3 // --> 'a_rotated' is 1 // --> 'do_conjugate' is 0 -__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS3, 1, 1))) +#endif void XgemvFastRot(const int m, const int n, const real_arg arg_alpha, const real_arg arg_beta, diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl index ca6071cd..3620c66a 100644 --- a/src/kernels/level2/xger.opencl +++ b/src/kernels/level2/xger.opencl @@ -18,7 +18,11 @@ R"( // ================================================================================================= // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC) -__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +#endif void Xger(const int max1, const int max2, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl index 8a57bdfc..9e460cd4 100644 --- a/src/kernels/level2/xher.opencl +++ b/src/kernels/level2/xher.opencl @@ -18,7 +18,11 @@ R"( // ================================================================================================= // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR) -__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +#endif void Xher(const int n, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl 
index 73305149..c3e85c15 100644 --- a/src/kernels/level2/xher2.opencl +++ b/src/kernels/level2/xher2.opencl @@ -18,7 +18,11 @@ R"( // ================================================================================================= // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2) -__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +#endif void Xher2(const int n, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl index e7b6ae79..e3b5418c 100644 --- a/src/kernels/level2/xtrsv.opencl +++ b/src/kernels/level2/xtrsv.opencl @@ -39,7 +39,11 @@ void FillVector(const int n, const int inc, const int offset, // ================================================================================================= -__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) +#endif void trsv_forward(int n, const __global real *A, const int a_offset, int a_ld, __global real *b, const int b_offset, int b_inc, @@ -87,7 +91,11 @@ void trsv_forward(int n, } } -__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) +#endif void trsv_backward(int n, const __global real *A, const int a_offset, int a_ld, __global real *b, const int b_offset, int b_inc, diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl index 0e89b78b..b69be23d 100644 --- a/src/kernels/level3/convert_hermitian.opencl +++ b/src/kernels/level3/convert_hermitian.opencl @@ -21,7 +21,11 @@ R"( // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void HermLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, @@ -60,7 +64,11 @@ void HermLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void HermUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl index 83ecdd65..2ce17f40 100644 --- a/src/kernels/level3/convert_symmetric.opencl +++ b/src/kernels/level3/convert_symmetric.opencl @@ -20,7 +20,11 @@ R"( // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. 
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void SymmLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, @@ -53,7 +57,11 @@ void SymmLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void SymmUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl index a9d5e769..563f719f 100644 --- a/src/kernels/level3/convert_triangular.opencl +++ b/src/kernels/level3/convert_triangular.opencl @@ -20,7 +20,11 @@ R"( // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void TriaLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, @@ -55,7 +59,11 @@ void TriaLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void TriaUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index ef8a9017..e1a815f5 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -35,7 +35,11 @@ R"( // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of // COPY_VW. Also requires both matrices to be of the same dimensions and without offset. 
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#endif void CopyMatrixFast(const int ld, __global const realC* restrict src, __global realC* dest, diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index 3d389b74..6335bd81 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -59,7 +59,11 @@ INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyPadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -118,7 +122,11 @@ INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -138,7 +146,11 @@ void CopyMatrix(const int src_one, const int src_two, #if defined(ROUTINE_GEMMBATCHED) // Batched version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyPadMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -156,7 +168,11 @@ void CopyPadMatrixBatched(const int src_one, const int src_two, } // Batched version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -177,7 +193,11 @@ void CopyMatrixBatched(const int src_one, const int src_two, #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyPadMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, @@ -195,7 +215,11 @@ void CopyPadMatrixStridedBatched(const int src_one, const int src_two, } // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +#endif void CopyMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index c1f96bd7..3df477d1 100644 
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -82,7 +82,11 @@ R"( // ================================================================================================= // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix -__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) +#endif void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld, __global real* restrict dest, const int outer_block_size, const int unit_diagonal, const int is_upper) diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index 1b9fca45..e89984cc 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -36,7 +36,11 @@ R"( // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without // offset. A more general version is available in 'padtranspose.opencl'. -__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +#endif void TransposeMatrixFast(const int ld, __global const realT* restrict src, __global realT* dest, diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index e55a8b7c..31de0e62 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -84,7 +84,11 @@ INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposePadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -172,7 +176,11 @@ INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile, } // Interface to the above function -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposeMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -193,7 +201,11 @@ void TransposeMatrix(const int src_one, const int src_two, #if defined(ROUTINE_GEMMBATCHED) // Batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposePadMatrixBatched(const int src_one, const int src_two, const int src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -212,7 +224,11 @@ void TransposePadMatrixBatched(const int src_one, const int src_two, } // Batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposeMatrixBatched(const int src_one, const int src_two, const int 
src_ld, const __constant int* src_offsets, __global const real* restrict src, @@ -234,7 +250,11 @@ void TransposeMatrixBatched(const int src_one, const int src_two, #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposePadMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, @@ -253,7 +273,11 @@ void TransposePadMatrixStridedBatched(const int src_one, const int src_two, } // Strided-batched version of the above -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +#endif void TransposeMatrixStridedBatched(const int src_one, const int src_two, const int src_ld, const int src_offset, const int src_stride, __global const real* restrict src, diff --git a/src/kernels/level3/xgemm_batched.opencl b/src/kernels/level3/xgemm_batched.opencl index b51e6298..e014b7a9 100644 --- a/src/kernels/level3/xgemm_batched.opencl +++ b/src/kernels/level3/xgemm_batched.opencl @@ -19,7 +19,11 @@ R"( // ================================================================================================= #if defined(ROUTINE_GEMMBATCHED) -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, @@ -62,7 +66,11 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, // ================================================================================================= #if defined(ROUTINE_GEMMSTRIDEDBATCHED) -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmStridedBatched(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realM* restrict agm, const int a_one, const int a_two, diff --git a/src/kernels/level3/xgemm_direct_batched.opencl b/src/kernels/level3/xgemm_direct_batched.opencl index d15ed31e..ec0b008b 100644 --- a/src/kernels/level3/xgemm_direct_batched.opencl +++ b/src/kernels/level3/xgemm_direct_batched.opencl @@ -20,7 +20,11 @@ R"( #if defined(ROUTINE_GEMMBATCHED) // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -41,7 +45,11 @@ void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed] -__kernel 
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -62,7 +70,11 @@ void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -83,7 +95,11 @@ void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, @@ -108,7 +124,11 @@ void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, #if defined(ROUTINE_GEMMSTRIDEDBATCHED) // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, @@ -127,7 +147,11 @@ void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int k } // Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, @@ -146,7 +170,11 @@ void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int k } // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int 
a_ld, const int a_stride, @@ -165,7 +193,11 @@ void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int k } // Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectStridedBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride, diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl index 0822c95f..66b880e6 100644 --- a/src/kernels/level3/xgemm_direct_part3.opencl +++ b/src/kernels/level3/xgemm_direct_part3.opencl @@ -218,7 +218,11 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize // ================================================================================================= // Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, @@ -233,7 +237,11 @@ void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, @@ -248,7 +256,11 @@ void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, @@ -263,7 +275,11 @@ void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, } // Direct version of the GEMM kernel with [A, B] = [transposed, transposed] -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, const __global realMD* restrict agm, const int a_offset, const int a_ld, diff --git a/src/kernels/level3/xgemm_part4.opencl b/src/kernels/level3/xgemm_part4.opencl index b1f1ade6..a64e2efa 100644 --- a/src/kernels/level3/xgemm_part4.opencl +++ b/src/kernels/level3/xgemm_part4.opencl @@ -19,7 +19,11 @@ R"( #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || 
defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) // Main entry point of the kernel. This is the upper-triangular version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmUpper(const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, @@ -55,7 +59,11 @@ void XgemmUpper(const int kSizeN, const int kSizeK, } // Main entry point of the kernel. This is the lower-triangular version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void XgemmLower(const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, @@ -95,7 +103,11 @@ void XgemmLower(const int kSizeN, const int kSizeK, #else // Main entry point of the kernel. This is the regular full version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +#endif void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, diff --git a/src/kernels/levelx/col2im.opencl b/src/kernels/levelx/col2im.opencl index 425ffbbc..fcc307c6 100644 --- a/src/kernels/levelx/col2im.opencl +++ b/src/kernels/levelx/col2im.opencl @@ -92,7 +92,11 @@ INLINE_FUNC void Xcol2im(const int input_h, const int input_w, const int channel // ================================================================================================= // Kernel flip version of the Xcol2im kernel (for convolution) -__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#endif void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels, const int output_h, const int output_w, const int kernel_h, const int kernel_w, @@ -113,7 +117,11 @@ void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels, } // Normal version of the Xcol2im kernel (for cross-correlation) -__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#endif void Xcol2imKernelNormal(const int input_h, const int input_w, const int channels, const int output_h, const int output_w, const int kernel_h, const int kernel_w, diff --git a/src/kernels/levelx/im2col.opencl b/src/kernels/levelx/im2col.opencl index 5db4cb5f..8324468e 100644 --- a/src/kernels/levelx/im2col.opencl +++ b/src/kernels/levelx/im2col.opencl @@ -74,7 +74,11 @@ INLINE_FUNC void Xim2col(const int input_h, const int input_w, const int channel // ================================================================================================= // Kernel flip version of the Xim2col kernel (for convolution) -__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#endif void Xim2colKernelFlip(const int input_h, const int input_w, const int channels, const int output_h, const int output_w, const int kernel_h, const int kernel_w, @@ -91,7 +95,11 @@ void Xim2colKernelFlip(const int input_h, const int 
input_w, const int channels, } // Normal version of the Xim2col kernel (for cross-correlation) -__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +#endif void Xim2colKernelNormal(const int input_h, const int input_w, const int channels, const int output_h, const int output_w, const int kernel_h, const int kernel_w, diff --git a/src/kernels/levelx/xconvgemm_part2.opencl b/src/kernels/levelx/xconvgemm_part2.opencl index 693cb120..79c40f59 100644 --- a/src/kernels/levelx/xconvgemm_part2.opencl +++ b/src/kernels/levelx/xconvgemm_part2.opencl @@ -23,7 +23,11 @@ R"( // ConvGEMM kernel #if defined(CONVGEMM_WITH_IM2COL) -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void Xconvgemm(const int num_patches, const int num_kernels, const int patch_size, const __global realND* restrict kernelgm, const int kernel_offset, __global real* resultgm, const int result_offset, const int result_stride, @@ -285,7 +289,11 @@ INLINE_FUNC void Xconvgemm(const int num_patches, const int num_kernels, const i } #if !defined(CONVGEMM_WITH_IM2COL) -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch_size, const __global realND* restrict kernelgm, const int kernel_offset, __global real* resultgm, const int result_offset, const int result_stride, @@ -306,7 +314,11 @@ void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch output_h, output_w, alm, blm, kernel_flip); } -__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +#endif void XconvgemmNormal(const int num_patches, const int num_kernels, const int patch_size, const __global realND* restrict kernelgm, const int kernel_offset, __global real* resultgm, const int result_offset, const int result_stride, diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp index aeb7a3e2..7170c30a 100644 --- a/src/utilities/compile.cpp +++ b/src/utilities/compile.cpp @@ -37,13 +37,13 @@ std::shared_ptr CompileFromSource( // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on // which it is known to work with all OpenCL platforms. - if (device.IsNVIDIA() || device.IsARM()) { + if (device.IsNVIDIA() || device.IsARM() || device.IsQualcomm()) { header_string += "#define USE_INLINE_KEYWORD 1\n"; } // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve // performance, but might result in a reduced accuracy. 
- if (device.IsAMD() && device.IsGPU()) { + if ((device.IsAMD() && device.IsGPU()) || device.IsQualcomm()) { header_string += "#define USE_CL_MAD 1\n"; } @@ -54,7 +54,7 @@ std::shared_ptr CompileFromSource( // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize // performance through better cache behaviour - if (device.IsARM() && device.IsGPU()) { + if ((device.IsARM() && device.IsGPU()) || device.IsQualcomm()) { header_string += "#define GLOBAL_MEM_FENCE 1\n"; } @@ -77,6 +77,12 @@ std::shared_ptr CompileFromSource( header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n"; } } + + // For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance. + // This option compiles without the workgroup size requirement and does not affect correctness. + if (device.IsQualcomm()) { + header_string += "#define RELAX_WORKGROUP_SIZE 1\n"; + } // Optionally adds a translation header from OpenCL kernels to CUDA kernels #ifdef CUDA_API diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index 29161e74..32de2e2e 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -463,6 +463,9 @@ std::string GetDeviceArchitecture(const Device& device) { else if (device.HasExtension(kKhronosAttributesAMD)) { device_architecture = device.Name(); // Name is architecture for AMD APP and AMD ROCm } + else if (device.IsQualcomm()) { // queries the Adreno GPU architecture version + device_architecture = device.AdrenoVersion(); + } // Note: no else - 'device_architecture' might be the empty string #endif -- cgit v1.2.3
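
For reference, the work-group-size guard that this patch repeats across the kernel files can be reduced to the following minimal, self-contained OpenCL C sketch. The kernel name CopyVector, the WGS value of 64, and the float-copy body are illustrative only and do not appear in CLBlast; note also that the hunks above spell the fallback branch as "#elif", while the sketch below uses the standard "#else" form for the unconditional alternative.

    // Default matches src/kernels/common.opencl: the workgroup size requirement stays
    // enabled unless the host-side compile step overrides it.
    #ifndef RELAX_WORKGROUP_SIZE
      #define RELAX_WORKGROUP_SIZE 0
    #endif

    #ifndef WGS
      #define WGS 64   // hypothetical work-group size; CLBlast tunes this per kernel and device
    #endif

    #if RELAX_WORKGROUP_SIZE == 1
      __kernel   // no reqd_work_group_size attribute: the Qualcomm/Adreno path
    #else
      __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
    #endif
    void CopyVector(const int n,
                    __global const float* restrict src,
                    __global float* dest) {
      for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
        dest[id] = src[id];
      }
    }

On the host side, CompileFromSource in src/utilities/compile.cpp prepends "#define RELAX_WORKGROUP_SIZE 1" to the kernel header string whenever device.IsQualcomm() is true, so tuned work-group sizes are still supplied at enqueue time but are no longer enforced as a compile-time attribute on Adreno GPUs.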