Merge tag '1.6.0' into gspr/post-bookworm

author: Gard Spreemann <gspr@nonempty.org> 2023-06-08 11:52:00 +0200
committer: Gard Spreemann <gspr@nonempty.org> 2023-06-08 11:52:00 +0200
commit: 63870a2e60c1bc8bfa7e3672457b551a8e51ffaf (patch)
tree: fe2c0cd5f62e3fbd17e58d3903ec6bb37983f620 /src/kernels/level1
parent: d31fb141cb597aaf405674621aa25f263aa375e1 (diff)
parent: b0b302889cc786907efb080c4e1beea30d2fa39f (diff)
9 files changed, 107 insertions, 23 deletions
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 85cbdc86..7cbbd6b5 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xamax(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global singlereal* maxgm, __global unsigned int* imaxgm) {
@@ -51,7 +55,7 @@ void Xamax(const int n,
   while (id < n) {
     const int x_index = id*x_inc + x_offset;
     #if PRECISION == 3232 || PRECISION == 6464
-      singlereal x = xgm[x_index].x;
+      singlereal x = fabs(xgm[x_index].x) + fabs(xgm[x_index].y);
     #else
       singlereal x = xgm[x_index];
     #endif
@@ -66,7 +70,7 @@ void Xamax(const int n,
     #endif
     if (x > max) {
       max = x;
-      imax = id*x_inc + x_offset;
+      imax = id;
     }
     id += WGS1*num_groups;
   }
@@ -96,7 +100,11 @@ void Xamax(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void XamaxEpilogue(const __global singlereal* restrict maxgm,
                    const __global unsigned int* restrict imaxgm,
                    __global unsigned int* imax, const int imax_offset) {
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 42e49d4c..683c6fad 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xasum(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* output) {
@@ -73,7 +77,11 @@ void Xasum(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void XasumEpilogue(const __global real* restrict input,
                    __global real* asum, const int asum_offset) {
   __local real lm[WGS2];
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index 772b57f3..a106ed01 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xaxpy(const int n, const real_arg arg_alpha,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* ygm, const int y_offset, const int y_inc) {
@@ -37,7 +41,11 @@ void Xaxpy(const int n, const real_arg arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses but with if-statement. Also
 // assumes that 'n' is dividable by 'VW' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XaxpyFaster(const int n, const real_arg arg_alpha,
                  const __global realV* restrict xgm,
                  __global realV* ygm) {
@@ -57,7 +65,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XaxpyFastest(const int n, const real_arg arg_alpha,
                   const __global realV* restrict xgm,
                   __global realV* ygm) {
@@ -75,7 +87,11 @@ void XaxpyFastest(const int n, const real_arg arg_alpha,
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses: batched version
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
                   const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc,
                   __global real* ygm, const __constant int* y_offsets, const int y_inc) {
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index aed80fc2..493197af 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xcopy(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* ygm, const int y_offset, const int y_inc) {
@@ -37,7 +41,11 @@ void Xcopy(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XcopyFast(const int n,
                const __global realV* restrict xgm,
                __global realV* ygm) {
diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl
index 1a703d96..64f6eb9d 100644
--- a/src/kernels/level1/xdot.opencl
+++ b/src/kernels/level1/xdot.opencl
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the sum operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xdot(const int n,
           const __global real* restrict xgm, const int x_offset, const int x_inc,
           const __global real* restrict ygm, const int y_offset, const int y_inc,
@@ -72,7 +76,11 @@ void Xdot(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void XdotEpilogue(const __global real* restrict input,
                   __global real* dot, const int dot_offset) {
   __local real lm[WGS2];
diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl
index 24e0c76c..47bb5170 100644
--- a/src/kernels/level1/xhad.opencl
+++ b/src/kernels/level1/xhad.opencl
@@ -66,7 +66,11 @@ INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
           const __global real* restrict xgm, const int x_offset, const int x_inc,
           const __global real* restrict ygm, const int y_offset, const int y_inc,
@@ -90,7 +94,11 @@ void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
 
 // Faster version of the kernel without offsets and strided accesses but with if-statement. Also
 // assumes that 'n' is dividable by 'VW' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
                 const __global realV* restrict xgm, const __global realV* restrict ygm,
                 __global realV* zgm) {
@@ -117,7 +125,11 @@ void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
                  const __global realV* restrict xgm, const __global realV* restrict ygm,
                  __global realV* zgm) {
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index 6a81c150..36ea49b4 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -30,7 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
 void Xnrm2(const int n,
            const __global real* restrict xgm, const int x_offset, const int x_inc,
            __global real* output) {
@@ -71,7 +75,11 @@ void Xnrm2(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
 void Xnrm2Epilogue(const __global real* restrict input,
                    __global real* nrm2, const int nrm2_offset) {
   __local real lm[WGS2];
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index cb133e88..e4260c7c 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xscal(const int n, const real_arg arg_alpha,
            __global real* xgm, const int x_offset, const int x_inc) {
   const real alpha = GetRealArg(arg_alpha);
@@ -40,7 +44,11 @@ void Xscal(const int n, const real_arg arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XscalFast(const int n, const real_arg arg_alpha,
                __global realV* xgm) {
   const real alpha = GetRealArg(arg_alpha);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index bf5b6194..2d384423 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -22,7 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void Xswap(const int n,
            __global real* xgm, const int x_offset, const int x_inc,
            __global real* ygm, const int y_offset, const int y_inc) {
@@ -39,7 +43,11 @@ void Xswap(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+  __kernel
+#else
+  __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
 void XswapFast(const int n,
                __global realV* xgm,
                __global realV* ygm) {
author	Gard Spreemann <gspr@nonempty.org>	2023-06-08 11:52:00 +0200
committer	Gard Spreemann <gspr@nonempty.org>	2023-06-08 11:52:00 +0200
commit	63870a2e60c1bc8bfa7e3672457b551a8e51ffaf (patch)
tree	fe2c0cd5f62e3fbd17e58d3903ec6bb37983f620 /src/kernels/level1
parent	d31fb141cb597aaf405674621aa25f263aa375e1 (diff)
parent	b0b302889cc786907efb080c4e1beea30d2fa39f (diff)