Implemented changes to boost Adreno performance (the reqd_work_group_size kernel attribute is omitted when RELAX_WORKGROUP_SIZE == 1) according to https://jira-dc.qualcomm.com/jira/browse/OSR-8731

author: Angus, Alexander <aangus@qti.qualcomm.com> 2023-01-03 10:56:04 -0800
committer: Angus, Alexander <aangus@qti.qualcomm.com> 2023-01-03 10:56:04 -0800
commit: 4f394608a28f419dfd6091c704148d9e638a26f0 (patch)
tree: 4c0e042109c4d249ff5b700fc49a862169edec5a /src/kernels/level1
parent: 03cffa83c5f7742f8ec0c5e762bb7048e38952f3 (diff)
9 files changed, 105 insertions, 21 deletions
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 85cbdc86..3600b9d2 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xamax(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global singlereal* maxgm, __global unsigned int* imaxgm) {
@@ -96,7 +100,11 @@ void Xamax(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void XamaxEpilogue(const __global singlereal* restrict maxgm,
                    const __global unsigned int* restrict imaxgm,
                    __global unsigned int* imax, const int imax_offset) {
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 42e49d4c..875221f4 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xasum(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* output) {
@@ -73,7 +77,11 @@ void Xasum(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void XasumEpilogue(const __global real* restrict input,
                    __global real* asum, const int asum_offset) {
   __local real lm[WGS2];
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index 772b57f3..b20ad200 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xaxpy(const int n, const real_arg arg_alpha,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* ygm, const int y_offset, const int y_inc) {
@@ -37,7 +41,11 @@ void Xaxpy(const int n, const real_arg arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses but with if-statement. Also
 // assumes that 'n' is dividable by 'VW' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XaxpyFaster(const int n, const real_arg arg_alpha,
                  const __global realV* restrict xgm,
                  __global realV* ygm) {
@@ -57,7 +65,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XaxpyFastest(const int n, const real_arg arg_alpha,
                   const __global realV* restrict xgm,
                   __global realV* ygm) {
@@ -75,7 +87,11 @@ void XaxpyFastest(const int n, const real_arg arg_alpha,
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses: batched version
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
                   const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc,
                   __global real* ygm, const __constant int* y_offsets, const int y_inc) {
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index aed80fc2..174bf0c6 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xcopy(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* ygm, const int y_offset, const int y_inc) {
@@ -37,7 +41,11 @@ void Xcopy(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XcopyFast(const int n,
                const __global realV* restrict xgm,
                __global realV* ygm) {
diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl
index 1a703d96..e14b6306 100644
--- a/src/kernels/level1/xdot.opencl
+++ b/src/kernels/level1/xdot.opencl
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the sum operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xdot(const int n,
           const __global real* restrict xgm, const int x_offset, const int x_inc,
           const __global real* restrict ygm, const int y_offset, const int y_inc,
@@ -72,7 +76,11 @@ void Xdot(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void XdotEpilogue(const __global real* restrict input,
                   __global real* dot, const int dot_offset) {
   __local real lm[WGS2];
diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl
index 24e0c76c..aee98f91 100644
--- a/src/kernels/level1/xhad.opencl
+++ b/src/kernels/level1/xhad.opencl
@@ -66,7 +66,11 @@ INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
           const __global real* restrict xgm, const int x_offset, const int x_inc,
           const __global real* restrict ygm, const int y_offset, const int y_inc,
@@ -90,7 +94,11 @@ void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
 
 // Faster version of the kernel without offsets and strided accesses but with if-statement. Also
 // assumes that 'n' is dividable by 'VW' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
                 const __global realV* restrict xgm, const __global realV* restrict ygm,
                 __global realV* zgm) {
@@ -117,7 +125,11 @@ void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
                  const __global realV* restrict xgm, const __global realV* restrict ygm,
                  __global realV* zgm) {
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index 6a81c150..fb45effb 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xnrm2(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* output) {
@@ -71,7 +75,11 @@ void Xnrm2(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void Xnrm2Epilogue(const __global real* restrict input,
                    __global real* nrm2, const int nrm2_offset) {
   __local real lm[WGS2];
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index cb133e88..19ca9135 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xscal(const int n, const real_arg arg_alpha,
            __global real* xgm, const int x_offset, const int x_inc) {
   const real alpha = GetRealArg(arg_alpha);
@@ -40,7 +44,11 @@ void Xscal(const int n, const real_arg arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XscalFast(const int n, const real_arg arg_alpha,
                __global realV* xgm) {
   const real alpha = GetRealArg(arg_alpha);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index bf5b6194..a2b44de3 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xswap(const int n,
            __global real* xgm, const int x_offset, const int x_inc,
            __global real* ygm, const int y_offset, const int y_inc) {
@@ -39,7 +43,11 @@ void Xswap(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XswapFast(const int n,
                __global realV* xgm,
                __global realV* ygm) {
author	Angus, Alexander <aangus@qti.qualcomm.com>	2023-01-03 10:56:04 -0800
committer	Angus, Alexander <aangus@qti.qualcomm.com>	2023-01-03 10:56:04 -0800
commit	4f394608a28f419dfd6091c704148d9e638a26f0 (patch)
tree	4c0e042109c4d249ff5b700fc49a862169edec5a /src/kernels/level1
parent	03cffa83c5f7742f8ec0c5e762bb7048e38952f3 (diff)