author     Cedric Nugteren <web@cedricnugteren.nl>   2023-01-21 20:28:32 +0100
committer  GitHub <noreply@github.com>               2023-01-21 20:28:32 +0100
commit     e72f87ae5eca5e2ea8aea4f2ce49408c1faa0521 (patch)
tree       588a426b6350a5c982d89d98749ae78667fd23b4
parent     03cffa83c5f7742f8ec0c5e762bb7048e38952f3 (diff)
parent     73f49e9b3d4abc4214122e4b8c07a736e01626ee (diff)
Merge pull request #451 from CodeLinaro/master
CLBlast modifications to address Qualcomm Adreno performance
-rw-r--r--  .gitignore                                              |  2
-rw-r--r--  CHANGELOG                                               |  5
-rw-r--r--  README.md                                               |  2
-rw-r--r--  src/clpp11.hpp                                          |  8
-rw-r--r--  src/database/kernels/copy/copy_32.hpp                   |  1
-rw-r--r--  src/kernel_preprocessor.cpp                             | 19
-rw-r--r--  src/kernels/common.opencl                               |  6
-rw-r--r--  src/kernels/level1/xamax.opencl                         | 12
-rw-r--r--  src/kernels/level1/xasum.opencl                         | 12
-rw-r--r--  src/kernels/level1/xaxpy.opencl                         | 24
-rw-r--r--  src/kernels/level1/xcopy.opencl                         | 12
-rw-r--r--  src/kernels/level1/xdot.opencl                          | 12
-rw-r--r--  src/kernels/level1/xhad.opencl                          | 18
-rw-r--r--  src/kernels/level1/xnrm2.opencl                         | 12
-rw-r--r--  src/kernels/level1/xscal.opencl                         | 12
-rw-r--r--  src/kernels/level1/xswap.opencl                         | 12
-rw-r--r--  src/kernels/level2/xgemv.opencl                         |  6
-rw-r--r--  src/kernels/level2/xgemv_fast.opencl                    | 12
-rw-r--r--  src/kernels/level2/xger.opencl                          |  6
-rw-r--r--  src/kernels/level2/xher.opencl                          |  6
-rw-r--r--  src/kernels/level2/xher2.opencl                         |  6
-rw-r--r--  src/kernels/level2/xtrsv.opencl                         | 12
-rw-r--r--  src/kernels/level3/convert_hermitian.opencl             | 12
-rw-r--r--  src/kernels/level3/convert_symmetric.opencl             | 12
-rw-r--r--  src/kernels/level3/convert_triangular.opencl            | 12
-rw-r--r--  src/kernels/level3/copy_fast.opencl                     |  6
-rw-r--r--  src/kernels/level3/copy_pad.opencl                      | 36
-rw-r--r--  src/kernels/level3/invert_diagonal_blocks_part1.opencl  |  6
-rw-r--r--  src/kernels/level3/transpose_fast.opencl                |  6
-rw-r--r--  src/kernels/level3/transpose_pad.opencl                 | 36
-rw-r--r--  src/kernels/level3/xgemm_batched.opencl                 | 12
-rw-r--r--  src/kernels/level3/xgemm_direct_batched.opencl          | 48
-rw-r--r--  src/kernels/level3/xgemm_direct_part3.opencl            | 24
-rw-r--r--  src/kernels/level3/xgemm_part4.opencl                   | 18
-rw-r--r--  src/kernels/levelx/col2im.opencl                        | 12
-rw-r--r--  src/kernels/levelx/im2col.opencl                        | 12
-rw-r--r--  src/kernels/levelx/xconvgemm_part2.opencl               | 18
-rw-r--r--  src/utilities/compile.cpp                               | 12
-rw-r--r--  src/utilities/utilities.cpp                             |  3
39 files changed, 422 insertions(+), 80 deletions(-)
diff --git a/.gitignore b/.gitignore
index ab80cec1..d77318d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,4 @@ database_best.json
cl.hpp
opencl.hpp
src/pyclblast/dist
-*.egg-info
+*.egg-info
\ No newline at end of file
diff --git a/CHANGELOG b/CHANGELOG
index 7088fb49..824f4520 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,10 @@
Development version (next version)
- Fixes two small issues in the plotting script
+- Modifications to improve performance on Qualcomm Adreno GPUs:
+ * Unique database entries for specific Adreno devices
+ * Toggle OpenCL kernel compilation options for Adreno
+ * New preprocessor directive RELAX_WORKGROUP_SIZE
+- Fixed a bug in handling of #undef in CLBlast loop unrolling and array-to-register mapping functions
Version 1.5.3
- Fix a correctness issue with DGEMM on SM 7.5 Turing GPUs
diff --git a/README.md b/README.md
index 834d6221..cbecc606 100644
--- a/README.md
+++ b/README.md
@@ -101,8 +101,6 @@ Known performance related issues:
* Severe performance issues with Beignet v1.3.0 due to missing support for local memory. Please downgrade to v1.2.1 or upgrade to v1.3.1 or newer.
-* Performance issues on Qualcomm Adreno GPUs.
-
Other known issues:
* Routines returning an integer are currently not properly tested for half-precision FP16: IHAMAX/IHAMIN/IHMAX/IHMIN
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 2a25606c..e5b8b4a7 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -365,6 +365,14 @@ class Device {
return false;
}
+ // Returns the Qualcomm Adreno GPU version (e.g. a650, a730, a740)
+ std::string AdrenoVersion() const {
+ if (IsQualcomm()) {
+ return GetInfoString(CL_DEVICE_OPENCL_C_VERSION);
+ }
+ else { return std::string{""}; }
+ }
+
// Retrieves the above extra information (if present)
std::string GetExtraInfo() const {
if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); }
diff --git a/src/database/kernels/copy/copy_32.hpp b/src/database/kernels/copy/copy_32.hpp
index f5ac8e30..f7ae1edb 100644
--- a/src/database/kernels/copy/copy_32.hpp
+++ b/src/database/kernels/copy/copy_32.hpp
@@ -135,6 +135,7 @@ const DatabaseEntry CopySingle = {
{ Name{"GeForce GTX 670 "}, Params{ 16, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"GeForce GTX 680 "}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"GeForce GTX 760 Ti OEM "}, Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { Name{"Quadro K600 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
{ "SM3.5", {
diff --git a/src/kernel_preprocessor.cpp b/src/kernel_preprocessor.cpp
index abe0cd76..7271237a 100644
--- a/src/kernel_preprocessor.cpp
+++ b/src/kernel_preprocessor.cpp
@@ -371,6 +371,25 @@ std::vector<std::string> PreprocessDefinesAndComments(const std::string& source,
defines_string.emplace(name, value);
}
+ // Detect #undef macros
+ // When USE_SUBGROUP_SHUFFLING is set, but kernel parameters do not satisfy the conditions
+ // for subgroup shuffle, USE_SUBGROUP_SHUFFLING needs to be unset in preprocessing
+ // to avoid GEMM kernel errors. See src/kernels/level3/xgemm_part1.opencl line 142.
+ // Note that this preprocessor never redefines a macro: std::map::emplace keeps the first definition of a name
+ const auto undef_pos = line.find("#undef ");
+ if (undef_pos != std::string::npos) {
+ const auto undef = line.substr(undef_pos + 7); // length of "#undef "
+ // Checks whether the definition exists in defines_int and/or defines_string, then removes it
+ auto int_undef = defines_int.find(undef);
+ if (int_undef != defines_int.end()){
+ defines_int.erase(int_undef);
+ }
+ auto string_undef = defines_string.find(undef);
+ if (string_undef != defines_string.end()){
+ defines_string.erase(string_undef);
+ }
+ }
+
// Detect #ifndef blocks
const auto ifndef_pos = line.find("#ifndef ");
if (ifndef_pos != std::string::npos) {
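The #undef handling above leans on two std::map properties: emplace() silently keeps the first definition of a name, and erase() clears the way for a later #define to take effect again. A self-contained sketch of that behaviour (the map name matches the diff; everything else is illustrative):

    #include <cassert>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, int> defines_int;

      defines_int.emplace("USE_SUBGROUP_SHUFFLING", 1);
      defines_int.emplace("USE_SUBGROUP_SHUFFLING", 0);  // no-op: emplace keeps the first value
      assert(defines_int.at("USE_SUBGROUP_SHUFFLING") == 1);

      // What the new #undef branch does: erase the entry if it exists...
      const auto it = defines_int.find("USE_SUBGROUP_SHUFFLING");
      if (it != defines_int.end()) { defines_int.erase(it); }

      // ...so that a later #define of the same name is honoured again.
      defines_int.emplace("USE_SUBGROUP_SHUFFLING", 0);
      assert(defines_int.at("USE_SUBGROUP_SHUFFLING") == 0);
      return 0;
    }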
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 56c1dae4..0ec741ad 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -132,6 +132,12 @@ R"(
#define USE_CL_MAD 0
#endif
+// By default the workgroup size requirement is enabled. For Qualcomm devices the workgroup size
+// requirement results in worse performance and is disabled (src/utilities/compile.cpp)
+#ifndef RELAX_WORKGROUP_SIZE
+ #define RELAX_WORKGROUP_SIZE 0
+#endif
+
// Sets a variable to zero
#if PRECISION == 3232 || PRECISION == 6464
#define SetToZero(a) a.x = ZERO; a.y = ZERO
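Every kernel hunk that follows repeats the same guard around its entry point, which reduces to the pattern below once this default from common.opencl is in scope. With RELAX_WORKGROUP_SIZE at 0 the compiler may specialise for the fixed, tuned work-group size; when src/utilities/compile.cpp defines it as 1 for Qualcomm, the requirement is dropped and the driver chooses the size. A self-contained OpenCL sketch; the Xaxpy signature and body are simplified stand-ins, and WGS here is a placeholder for the database-tuned value:

    #ifndef WGS
      #define WGS 64  // stand-in for the database-tuned value
    #endif

    #if RELAX_WORKGROUP_SIZE == 1
      __kernel                                                    // Adreno: driver picks the size
    #else
      __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))   // default: fixed, tuned size
    #endif
    void Xaxpy(const int n, const float alpha,
               const __global float* restrict xgm, __global float* ygm) {
      const int id = get_global_id(0);
      if (id < n) { ygm[id] += alpha * xgm[id]; }  // illustrative body, not CLBlast's real one
    }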
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 85cbdc86..06a6773b 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -30,7 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
void Xamax(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global singlereal* maxgm, __global unsigned int* imaxgm) {
@@ -96,7 +100,11 @@ void Xamax(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
void XamaxEpilogue(const __global singlereal* restrict maxgm,
const __global unsigned int* restrict imaxgm,
__global unsigned int* imax, const int imax_offset) {
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 42e49d4c..683c6fad 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -30,7 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
void Xasum(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
@@ -73,7 +77,11 @@ void Xasum(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
void XasumEpilogue(const __global real* restrict input,
__global real* asum, const int asum_offset) {
__local real lm[WGS2];
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index 772b57f3..a106ed01 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -22,7 +22,11 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void Xaxpy(const int n, const real_arg arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
@@ -37,7 +41,11 @@ void Xaxpy(const int n, const real_arg arg_alpha,
// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
// assumes that 'n' is dividable by 'VW' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XaxpyFaster(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
@@ -57,7 +65,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XaxpyFastest(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
@@ -75,7 +87,11 @@ void XaxpyFastest(const int n, const real_arg arg_alpha,
// =================================================================================================
// Full version of the kernel with offsets and strided accesses: batched version
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc,
__global real* ygm, const __constant int* y_offsets, const int y_inc) {
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index aed80fc2..493197af 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -22,7 +22,11 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void Xcopy(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
@@ -37,7 +41,11 @@ void Xcopy(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XcopyFast(const int n,
const __global realV* restrict xgm,
__global realV* ygm) {
diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl
index 1a703d96..64f6eb9d 100644
--- a/src/kernels/level1/xdot.opencl
+++ b/src/kernels/level1/xdot.opencl
@@ -30,7 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the sum operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
void Xdot(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
@@ -72,7 +76,11 @@ void Xdot(const int n,
// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
// be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
void XdotEpilogue(const __global real* restrict input,
__global real* dot, const int dot_offset) {
__local real lm[WGS2];
diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl
index 24e0c76c..47bb5170 100644
--- a/src/kernels/level1/xhad.opencl
+++ b/src/kernels/level1/xhad.opencl
@@ -66,7 +66,11 @@ INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
@@ -90,7 +94,11 @@ void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
// assumes that 'n' is dividable by 'VW' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
const __global realV* restrict xgm, const __global realV* restrict ygm,
__global realV* zgm) {
@@ -117,7 +125,11 @@ void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
const __global realV* restrict xgm, const __global realV* restrict ygm,
__global realV* zgm) {
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index 6a81c150..36ea49b4 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -30,7 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
void Xnrm2(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
@@ -71,7 +75,11 @@ void Xnrm2(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
void Xnrm2Epilogue(const __global real* restrict input,
__global real* nrm2, const int nrm2_offset) {
__local real lm[WGS2];
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index cb133e88..e4260c7c 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -22,7 +22,11 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void Xscal(const int n, const real_arg arg_alpha,
__global real* xgm, const int x_offset, const int x_inc) {
const real alpha = GetRealArg(arg_alpha);
@@ -40,7 +44,11 @@ void Xscal(const int n, const real_arg arg_alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XscalFast(const int n, const real_arg arg_alpha,
__global realV* xgm) {
const real alpha = GetRealArg(arg_alpha);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index bf5b6194..2d384423 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -22,7 +22,11 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void Xswap(const int n,
__global real* xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
@@ -39,7 +43,11 @@ void Xswap(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XswapFast(const int n,
__global realV* xgm,
__global realV* ygm) {
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index ba29aba6..15912a60 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -210,7 +210,11 @@ INLINE_FUNC real LoadMatrixA(const __global real* restrict agm, const int x, con
// =================================================================================================
// Full version of the kernel
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
void Xgemv(const int m, const int n,
const real_arg arg_alpha,
const real_arg arg_beta,
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 45ceb36c..46087af7 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -88,7 +88,11 @@ INLINE_FUNC realVF LoadMatrixAVF(const __global realVF* restrict agm, const int
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
void XgemvFast(const int m, const int n,
const real_arg arg_alpha,
const real_arg arg_beta,
@@ -191,7 +195,11 @@ void XgemvFast(const int m, const int n,
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
-__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
+#endif
void XgemvFastRot(const int m, const int n,
const real_arg arg_alpha,
const real_arg arg_beta,
diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl
index ca6071cd..b1fe8447 100644
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@@ -18,7 +18,11 @@ R"(
// =================================================================================================
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
-__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#endif
void Xger(const int max1, const int max2,
const real_arg arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index 8a57bdfc..eac8c10a 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -18,7 +18,11 @@ R"(
// =================================================================================================
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
-__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#endif
void Xher(const int n,
const real_arg arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index 73305149..9e7f3c6c 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -18,7 +18,11 @@ R"(
// =================================================================================================
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
-__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+#endif
void Xher2(const int n,
const real_arg arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
index e7b6ae79..7677377e 100644
--- a/src/kernels/level2/xtrsv.opencl
+++ b/src/kernels/level2/xtrsv.opencl
@@ -39,7 +39,11 @@ void FillVector(const int n, const int inc, const int offset,
// =================================================================================================
-__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
+#endif
void trsv_forward(int n,
const __global real *A, const int a_offset, int a_ld,
__global real *b, const int b_offset, int b_inc,
@@ -87,7 +91,11 @@ void trsv_forward(int n,
}
}
-__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
+#endif
void trsv_backward(int n,
const __global real *A, const int a_offset, int a_ld,
__global real *b, const int b_offset, int b_inc,
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index 0e89b78b..4bb61f4e 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -21,7 +21,11 @@ R"(
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void HermLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -60,7 +64,11 @@ void HermLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void HermUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index 83ecdd65..264bf9c5 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -20,7 +20,11 @@ R"(
// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void SymmLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -53,7 +57,11 @@ void SymmLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void SymmUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index a9d5e769..092e1cf2 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -20,7 +20,11 @@ R"(
// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void TriaLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -55,7 +59,11 @@ void TriaLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix' data is stored in the upper-triangle
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void TriaUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index ef8a9017..04e4b2ab 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -35,7 +35,11 @@ R"(
// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
void CopyMatrixFast(const int ld,
__global const realC* restrict src,
__global realC* dest,
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index 3d389b74..dca93b76 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -59,7 +59,11 @@ INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two,
}
// Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -118,7 +122,11 @@ INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two,
}
// Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -138,7 +146,11 @@ void CopyMatrix(const int src_one, const int src_two,
#if defined(ROUTINE_GEMMBATCHED)
// Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyPadMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
@@ -156,7 +168,11 @@ void CopyPadMatrixBatched(const int src_one, const int src_two,
}
// Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
@@ -177,7 +193,11 @@ void CopyMatrixBatched(const int src_one, const int src_two,
#if defined(ROUTINE_GEMMSTRIDEDBATCHED)
// Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyPadMatrixStridedBatched(const int src_one, const int src_two,
const int src_ld, const int src_offset,
const int src_stride, __global const real* restrict src,
@@ -195,7 +215,11 @@ void CopyPadMatrixStridedBatched(const int src_one, const int src_two,
}
// Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+#endif
void CopyMatrixStridedBatched(const int src_one, const int src_two,
const int src_ld, const int src_offset,
const int src_stride, __global const real* restrict src,
diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index c1f96bd7..580f7b8b 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -82,7 +82,11 @@ R"(
// =================================================================================================
// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
-__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
+#endif
void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld,
__global real* restrict dest, const int outer_block_size,
const int unit_diagonal, const int is_upper)
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index 1b9fca45..9d1b7552 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -36,7 +36,11 @@ R"(
// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
// offset. A more general version is available in 'padtranspose.opencl'.
-__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+#endif
void TransposeMatrixFast(const int ld,
__global const realT* restrict src,
__global realT* dest,
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index e55a8b7c..3877a3d5 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -84,7 +84,11 @@ INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile,
}
// Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposePadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -172,7 +176,11 @@ INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile,
}
// Interface to the above function
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposeMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
@@ -193,7 +201,11 @@ void TransposeMatrix(const int src_one, const int src_two,
#if defined(ROUTINE_GEMMBATCHED)
// Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposePadMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
@@ -212,7 +224,11 @@ void TransposePadMatrixBatched(const int src_one, const int src_two,
}
// Batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposeMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
@@ -234,7 +250,11 @@ void TransposeMatrixBatched(const int src_one, const int src_two,
#if defined(ROUTINE_GEMMSTRIDEDBATCHED)
// Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposePadMatrixStridedBatched(const int src_one, const int src_two,
const int src_ld, const int src_offset,
const int src_stride, __global const real* restrict src,
@@ -253,7 +273,11 @@ void TransposePadMatrixStridedBatched(const int src_one, const int src_two,
}
// Strided-batched version of the above
-__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+#endif
void TransposeMatrixStridedBatched(const int src_one, const int src_two,
const int src_ld, const int src_offset,
const int src_stride, __global const real* restrict src,
diff --git a/src/kernels/level3/xgemm_batched.opencl b/src/kernels/level3/xgemm_batched.opencl
index b51e6298..41d07d19 100644
--- a/src/kernels/level3/xgemm_batched.opencl
+++ b/src/kernels/level3/xgemm_batched.opencl
@@ -19,7 +19,11 @@ R"(
// =================================================================================================
#if defined(ROUTINE_GEMMBATCHED)
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas,
const __constant real_arg* arg_betas,
@@ -62,7 +66,11 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
// =================================================================================================
#if defined(ROUTINE_GEMMSTRIDEDBATCHED)
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
void XgemmStridedBatched(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realM* restrict agm, const int a_one, const int a_two,
diff --git a/src/kernels/level3/xgemm_direct_batched.opencl b/src/kernels/level3/xgemm_direct_batched.opencl
index d15ed31e..102ae762 100644
--- a/src/kernels/level3/xgemm_direct_batched.opencl
+++ b/src/kernels/level3/xgemm_direct_batched.opencl
@@ -20,7 +20,11 @@ R"(
#if defined(ROUTINE_GEMMBATCHED)
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -41,7 +45,11 @@ void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -62,7 +70,11 @@ void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -83,7 +95,11 @@ void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
@@ -108,7 +124,11 @@ void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
#if defined(ROUTINE_GEMMSTRIDEDBATCHED)
// Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -127,7 +147,11 @@ void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int k
}
// Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -146,7 +170,11 @@ void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int k
}
// Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
@@ -165,7 +193,11 @@ void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int k
}
// Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectStridedBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld, const int a_stride,
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
index 0822c95f..5508170e 100644
--- a/src/kernels/level3/xgemm_direct_part3.opencl
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -218,7 +218,11 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize
// =================================================================================================
// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld,
@@ -233,7 +237,11 @@ void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld,
@@ -248,7 +256,11 @@ void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld,
@@ -263,7 +275,11 @@ void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
}
// Direct version of the GEMM kernel with [A, B] = [transposed, transposed]
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha, const real_arg arg_beta,
const __global realMD* restrict agm, const int a_offset, const int a_ld,
diff --git a/src/kernels/level3/xgemm_part4.opencl b/src/kernels/level3/xgemm_part4.opencl
index b1f1ade6..05524337 100644
--- a/src/kernels/level3/xgemm_part4.opencl
+++ b/src/kernels/level3/xgemm_part4.opencl
@@ -19,7 +19,11 @@ R"(
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
// Main entry point of the kernel. This is the upper-triangular version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
void XgemmUpper(const int kSizeN, const int kSizeK,
const real_arg arg_alpha,
const real_arg arg_beta,
@@ -55,7 +59,11 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
}
// Main entry point of the kernel. This is the lower-triangular version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
void XgemmLower(const int kSizeN, const int kSizeK,
const real_arg arg_alpha,
const real_arg arg_beta,
@@ -95,7 +103,11 @@ void XgemmLower(const int kSizeN, const int kSizeK,
#else
// Main entry point of the kernel. This is the regular full version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+#endif
void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha,
const real_arg arg_beta,
diff --git a/src/kernels/levelx/col2im.opencl b/src/kernels/levelx/col2im.opencl
index 425ffbbc..ab0ffbfa 100644
--- a/src/kernels/levelx/col2im.opencl
+++ b/src/kernels/levelx/col2im.opencl
@@ -92,7 +92,11 @@ INLINE_FUNC void Xcol2im(const int input_h, const int input_w, const int channel
// =================================================================================================
// Kernel flip version of the Xcol2im kernel (for convolution)
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels,
const int output_h, const int output_w,
const int kernel_h, const int kernel_w,
@@ -113,7 +117,11 @@ void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels,
}
// Normal version of the Xcol2im kernel (for cross-correlation)
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
void Xcol2imKernelNormal(const int input_h, const int input_w, const int channels,
const int output_h, const int output_w,
const int kernel_h, const int kernel_w,
diff --git a/src/kernels/levelx/im2col.opencl b/src/kernels/levelx/im2col.opencl
index 5db4cb5f..59af38fc 100644
--- a/src/kernels/levelx/im2col.opencl
+++ b/src/kernels/levelx/im2col.opencl
@@ -74,7 +74,11 @@ INLINE_FUNC void Xim2col(const int input_h, const int input_w, const int channel
// =================================================================================================
// Kernel flip version of the Xim2col kernel (for convolution)
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
void Xim2colKernelFlip(const int input_h, const int input_w, const int channels,
const int output_h, const int output_w,
const int kernel_h, const int kernel_w,
@@ -91,7 +95,11 @@ void Xim2colKernelFlip(const int input_h, const int input_w, const int channels,
}
// Normal version of the Xim2col kernel (for cross-correlation)
-__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+#endif
void Xim2colKernelNormal(const int input_h, const int input_w, const int channels,
const int output_h, const int output_w,
const int kernel_h, const int kernel_w,
diff --git a/src/kernels/levelx/xconvgemm_part2.opencl b/src/kernels/levelx/xconvgemm_part2.opencl
index 693cb120..38ddd7eb 100644
--- a/src/kernels/levelx/xconvgemm_part2.opencl
+++ b/src/kernels/levelx/xconvgemm_part2.opencl
@@ -23,7 +23,11 @@ R"(
// ConvGEMM kernel
#if defined(CONVGEMM_WITH_IM2COL)
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void Xconvgemm(const int num_patches, const int num_kernels, const int patch_size,
const __global realND* restrict kernelgm, const int kernel_offset,
__global real* resultgm, const int result_offset, const int result_stride,
@@ -285,7 +289,11 @@ INLINE_FUNC void Xconvgemm(const int num_patches, const int num_kernels, const i
}
#if !defined(CONVGEMM_WITH_IM2COL)
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch_size,
const __global realND* restrict kernelgm, const int kernel_offset,
__global real* resultgm, const int result_offset, const int result_stride,
@@ -306,7 +314,11 @@ void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch
output_h, output_w, alm, blm, kernel_flip);
}
-__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+#endif
void XconvgemmNormal(const int num_patches, const int num_kernels, const int patch_size,
const __global realND* restrict kernelgm, const int kernel_offset,
__global real* resultgm, const int result_offset, const int result_stride,
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index aeb7a3e2..59aa6107 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -37,13 +37,13 @@ std::shared_ptr<Program> CompileFromSource(
// Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
// which it is known to work with all OpenCL platforms.
- if (device.IsNVIDIA() || device.IsARM()) {
+ if (device.IsNVIDIA() || device.IsARM() || device.IsQualcomm()) {
header_string += "#define USE_INLINE_KEYWORD 1\n";
}
// For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
- if (device.IsAMD() && device.IsGPU()) {
+ if ((device.IsAMD() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
header_string += "#define USE_CL_MAD 1\n";
}
@@ -54,7 +54,7 @@ std::shared_ptr<Program> CompileFromSource(
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
// performance through better cache behaviour
- if (device.IsARM() && device.IsGPU()) {
+ if ((device.IsARM() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
header_string += "#define GLOBAL_MEM_FENCE 1\n";
}
@@ -77,6 +77,12 @@ std::shared_ptr<Program> CompileFromSource(
header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
}
}
+
+ // For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance.
+ // This option compiles without the workgroup size requirement and does not affect correctness.
+ if (device.IsQualcomm()) {
+ header_string += "#define RELAX_WORKGROUP_SIZE 1\n";
+ }
// Optionally adds a translation header from OpenCL kernels to CUDA kernels
#ifdef CUDA_API
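Because these defines are prepended to the kernel source as a header string, the Adreno path needs no per-kernel host changes: the one added line flips every RELAX_WORKGROUP_SIZE guard at once. A condensed sketch of that flow, with the device query and compile call reduced to placeholders (not the real CLBlast signatures):

    #include <string>

    bool IsQualcommGpu() { return true; }                 // placeholder device query
    void CompileOpenCl(const std::string& /*source*/) {}  // placeholder compile call

    int main() {
      std::string header_string;
      if (IsQualcommGpu()) {
        // The line this diff adds to CompileFromSource(): every kernel compiled
        // with this header omits its reqd_work_group_size attribute.
        header_string += "#define RELAX_WORKGROUP_SIZE 1\n";
      }
      const std::string kernel_source = "/* contents of the .opencl files */";
      CompileOpenCl(header_string + kernel_source);  // defines land before the kernels
      return 0;
    }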
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index 29161e74..fbdcf9c2 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -463,6 +463,9 @@ std::string GetDeviceArchitecture(const Device& device) {
else if (device.HasExtension(kKhronosAttributesAMD)) {
device_architecture = device.Name(); // Name is architecture for AMD APP and AMD ROCm
}
+ else if ((device.IsQualcomm() && device.IsGPU())) { // queries the Adreno GPU architecture version
+ device_architecture = device.AdrenoVersion();
+ }
// Note: no else - 'device_architecture' might be the empty string
#endif