summaryrefslogtreecommitdiff
path: root/src/kernels/level1
diff options
context:
space:
mode:
authorGard Spreemann <gspr@nonempty.org>2023-06-08 11:52:00 +0200
committerGard Spreemann <gspr@nonempty.org>2023-06-08 11:52:00 +0200
commit63870a2e60c1bc8bfa7e3672457b551a8e51ffaf (patch)
treefe2c0cd5f62e3fbd17e58d3903ec6bb37983f620 /src/kernels/level1
parentd31fb141cb597aaf405674621aa25f263aa375e1 (diff)
parentb0b302889cc786907efb080c4e1beea30d2fa39f (diff)
Merge tag '1.6.0' into gspr/post-bookworm
Diffstat (limited to 'src/kernels/level1')
-rw-r--r--src/kernels/level1/xamax.opencl16
-rw-r--r--src/kernels/level1/xasum.opencl12
-rw-r--r--src/kernels/level1/xaxpy.opencl24
-rw-r--r--src/kernels/level1/xcopy.opencl12
-rw-r--r--src/kernels/level1/xdot.opencl12
-rw-r--r--src/kernels/level1/xhad.opencl18
-rw-r--r--src/kernels/level1/xnrm2.opencl12
-rw-r--r--src/kernels/level1/xscal.opencl12
-rw-r--r--src/kernels/level1/xswap.opencl12
9 files changed, 107 insertions, 23 deletions
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 85cbdc86..7cbbd6b5 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -30,7 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
void Xamax(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global singlereal* maxgm, __global unsigned int* imaxgm) {
@@ -51,7 +55,7 @@ void Xamax(const int n,
while (id < n) {
const int x_index = id*x_inc + x_offset;
#if PRECISION == 3232 || PRECISION == 6464
- singlereal x = xgm[x_index].x;
+ singlereal x = fabs(xgm[x_index].x) + fabs(xgm[x_index].y);
#else
singlereal x = xgm[x_index];
#endif
@@ -66,7 +70,7 @@ void Xamax(const int n,
#endif
if (x > max) {
max = x;
- imax = id*x_inc + x_offset;
+ imax = id;
}
id += WGS1*num_groups;
}
@@ -96,7 +100,11 @@ void Xamax(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
void XamaxEpilogue(const __global singlereal* restrict maxgm,
const __global unsigned int* restrict imaxgm,
__global unsigned int* imax, const int imax_offset) {
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 42e49d4c..683c6fad 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -30,7 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
void Xasum(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
@@ -73,7 +77,11 @@ void Xasum(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
void XasumEpilogue(const __global real* restrict input,
__global real* asum, const int asum_offset) {
__local real lm[WGS2];
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index 772b57f3..a106ed01 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -22,7 +22,11 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void Xaxpy(const int n, const real_arg arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
@@ -37,7 +41,11 @@ void Xaxpy(const int n, const real_arg arg_alpha,
// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
// assumes that 'n' is dividable by 'VW' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XaxpyFaster(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
@@ -57,7 +65,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XaxpyFastest(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
@@ -75,7 +87,11 @@ void XaxpyFastest(const int n, const real_arg arg_alpha,
// =================================================================================================
// Full version of the kernel with offsets and strided accesses: batched version
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc,
__global real* ygm, const __constant int* y_offsets, const int y_inc) {
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index aed80fc2..493197af 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -22,7 +22,11 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void Xcopy(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
@@ -37,7 +41,11 @@ void Xcopy(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XcopyFast(const int n,
const __global realV* restrict xgm,
__global realV* ygm) {
diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl
index 1a703d96..64f6eb9d 100644
--- a/src/kernels/level1/xdot.opencl
+++ b/src/kernels/level1/xdot.opencl
@@ -30,7 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the sum operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
void Xdot(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
@@ -72,7 +76,11 @@ void Xdot(const int n,
// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
// be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
void XdotEpilogue(const __global real* restrict input,
__global real* dot, const int dot_offset) {
__local real lm[WGS2];
diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl
index 24e0c76c..47bb5170 100644
--- a/src/kernels/level1/xhad.opencl
+++ b/src/kernels/level1/xhad.opencl
@@ -66,7 +66,11 @@ INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
@@ -90,7 +94,11 @@ void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
// assumes that 'n' is dividable by 'VW' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
const __global realV* restrict xgm, const __global realV* restrict ygm,
__global realV* zgm) {
@@ -117,7 +125,11 @@ void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
const __global realV* restrict xgm, const __global realV* restrict ygm,
__global realV* zgm) {
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index 6a81c150..36ea49b4 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -30,7 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the operation
-__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+#endif
void Xnrm2(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
@@ -71,7 +75,11 @@ void Xnrm2(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+#endif
void Xnrm2Epilogue(const __global real* restrict input,
__global real* nrm2, const int nrm2_offset) {
__local real lm[WGS2];
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index cb133e88..e4260c7c 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -22,7 +22,11 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void Xscal(const int n, const real_arg arg_alpha,
__global real* xgm, const int x_offset, const int x_inc) {
const real alpha = GetRealArg(arg_alpha);
@@ -40,7 +44,11 @@ void Xscal(const int n, const real_arg arg_alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XscalFast(const int n, const real_arg arg_alpha,
__global realV* xgm) {
const real alpha = GetRealArg(arg_alpha);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index bf5b6194..2d384423 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -22,7 +22,11 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void Xswap(const int n,
__global real* xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
@@ -39,7 +43,11 @@ void Xswap(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#if RELAX_WORKGROUP_SIZE == 1
+ __kernel
+#else
+ __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+#endif
void XswapFast(const int n,
__global realV* xgm,
__global realV* ygm) {