diff options
author | Angus, Alexander <aangus@qti.qualcomm.com> | 2023-01-03 10:56:04 -0800 |
---|---|---|
committer | Angus, Alexander <aangus@qti.qualcomm.com> | 2023-01-03 10:56:04 -0800 |
commit | 4f394608a28f419dfd6091c704148d9e638a26f0 (patch) | |
tree | 4c0e042109c4d249ff5b700fc49a862169edec5a /src/kernels/level1 | |
parent | 03cffa83c5f7742f8ec0c5e762bb7048e38952f3 (diff) |
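Implemented changes to boost Adreno performance according to https://jira-dc.qualcomm.com/jira/browse/OSR-8731: the level-1 kernels are declared without the `reqd_work_group_size` attribute when `RELAX_WORKGROUP_SIZE == 1`.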
Diffstat (limited to 'src/kernels/level1')
-rw-r--r-- | src/kernels/level1/xamax.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level1/xasum.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level1/xaxpy.opencl | 24 | ||||
-rw-r--r-- | src/kernels/level1/xcopy.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level1/xdot.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level1/xhad.opencl | 18 | ||||
-rw-r--r-- | src/kernels/level1/xnrm2.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level1/xscal.opencl | 12 | ||||
-rw-r--r-- | src/kernels/level1/xswap.opencl | 12 |
9 files changed, 105 insertions, 21 deletions
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 85cbdc86..3600b9d2 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -30,7 +30,11 @@ R"( // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation -__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#endif void Xamax(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global singlereal* maxgm, __global unsigned int* imaxgm) { @@ -96,7 +100,11 @@ void Xamax(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. -__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#endif void XamaxEpilogue(const __global singlereal* restrict maxgm, const __global unsigned int* restrict imaxgm, __global unsigned int* imax, const int imax_offset) { diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl index 42e49d4c..875221f4 100644 --- a/src/kernels/level1/xasum.opencl +++ b/src/kernels/level1/xasum.opencl @@ -30,7 +30,11 @@ R"( // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation -__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#endif void Xasum(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* output) { @@ -73,7 +77,11 @@ void Xasum(const int n, 
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. -__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#endif void XasumEpilogue(const __global real* restrict input, __global real* asum, const int asum_offset) { __local real lm[WGS2]; diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index 772b57f3..b20ad200 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -22,7 +22,11 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void Xaxpy(const int n, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { @@ -37,7 +41,11 @@ void Xaxpy(const int n, const real_arg arg_alpha, // Faster version of the kernel without offsets and strided accesses but with if-statement. Also // assumes that 'n' is dividable by 'VW' and 'WPT'. -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XaxpyFaster(const int n, const real_arg arg_alpha, const __global realV* restrict xgm, __global realV* ygm) { @@ -57,7 +65,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. 
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XaxpyFastest(const int n, const real_arg arg_alpha, const __global realV* restrict xgm, __global realV* ygm) { @@ -75,7 +87,11 @@ void XaxpyFastest(const int n, const real_arg arg_alpha, // ================================================================================================= // Full version of the kernel with offsets and strided accesses: batched version -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XaxpyBatched(const int n, const __constant real_arg* arg_alphas, const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc, __global real* ygm, const __constant int* y_offsets, const int y_inc) { diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl index aed80fc2..174bf0c6 100644 --- a/src/kernels/level1/xcopy.opencl +++ b/src/kernels/level1/xcopy.opencl @@ -22,7 +22,11 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void Xcopy(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { @@ -37,7 +41,11 @@ void Xcopy(const int n, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. 
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XcopyFast(const int n, const __global realV* restrict xgm, __global realV* ygm) { diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl index 1a703d96..e14b6306 100644 --- a/src/kernels/level1/xdot.opencl +++ b/src/kernels/level1/xdot.opencl @@ -30,7 +30,11 @@ R"( // ================================================================================================= // The main reduction kernel, performing the multiplication and the majority of the sum operation -__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#endif void Xdot(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, @@ -72,7 +76,11 @@ void Xdot(const int n, // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to // be launched with a single workgroup only. 
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#endif void XdotEpilogue(const __global real* restrict input, __global real* dot, const int dot_offset) { __local real lm[WGS2]; diff --git a/src/kernels/level1/xhad.opencl b/src/kernels/level1/xhad.opencl index 24e0c76c..aee98f91 100644 --- a/src/kernels/level1/xhad.opencl +++ b/src/kernels/level1/xhad.opencl @@ -66,7 +66,11 @@ INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, @@ -90,7 +94,11 @@ void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta, // Faster version of the kernel without offsets and strided accesses but with if-statement. Also // assumes that 'n' is dividable by 'VW' and 'WPT'. -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta, const __global realV* restrict xgm, const __global realV* restrict ygm, __global realV* zgm) { @@ -117,7 +125,11 @@ void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. 
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta, const __global realV* restrict xgm, const __global realV* restrict ygm, __global realV* zgm) { diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl index 6a81c150..fb45effb 100644 --- a/src/kernels/level1/xnrm2.opencl +++ b/src/kernels/level1/xnrm2.opencl @@ -30,7 +30,11 @@ R"( // ================================================================================================= // The main reduction kernel, performing the multiplication and the majority of the operation -__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +#endif void Xnrm2(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* output) { @@ -71,7 +75,11 @@ void Xnrm2(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. 
-__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +#endif void Xnrm2Epilogue(const __global real* restrict input, __global real* nrm2, const int nrm2_offset) { __local real lm[WGS2]; diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl index cb133e88..19ca9135 100644 --- a/src/kernels/level1/xscal.opencl +++ b/src/kernels/level1/xscal.opencl @@ -22,7 +22,11 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void Xscal(const int n, const real_arg arg_alpha, __global real* xgm, const int x_offset, const int x_inc) { const real alpha = GetRealArg(arg_alpha); @@ -40,7 +44,11 @@ void Xscal(const int n, const real_arg arg_alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. 
-__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XscalFast(const int n, const real_arg arg_alpha, __global realV* xgm) { const real alpha = GetRealArg(arg_alpha); diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl index bf5b6194..a2b44de3 100644 --- a/src/kernels/level1/xswap.opencl +++ b/src/kernels/level1/xswap.opencl @@ -22,7 +22,11 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void Xswap(const int n, __global real* xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { @@ -39,7 +43,11 @@ void Xswap(const int n, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#if RELAX_WORKGROUP_SIZE == 1 + __kernel +#elif + __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +#endif void XswapFast(const int n, __global realV* xgm, __global realV* ygm) { |