summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2018-10-15 20:08:29 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2018-10-15 20:08:29 +0200
commit 664a238adfd581dd1b2e6ed94592ba956ae117dc (patch)
tree 8cd732633e0bb3e8c3a8e12a652d789bcf0e73f9
parent 634b2bc75c74b80cdd2c60f87472d6f8f467cbba (diff)
Fixed a bug in the XaxpyFaster kernel for specific parameters
-rw-r--r-- CHANGELOG 1
-rw-r--r-- src/kernels/level1/xaxpy.opencl 5
2 files changed, 4 insertions, 2 deletions
diff --git a/CHANGELOG b/CHANGELOG
index bc856357..18c9051d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,7 @@ Development (next version)
- Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
- Fixed an issue with the preprocessor and the new GEMMK == 1 kernel
+- Fixed an issue for certain parameters for AXPY's 'XaxpyFaster' kernel
- Various minor fixes and enhancements
- Added non-BLAS routines:
* SCONVGEMM/DCONVGEMM/HCONVGEMM (convolution as im2col followed by batched GEMM)
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index 74e49930..2829237e 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -43,10 +43,11 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
__global realV* ygm) {
const real alpha = GetRealArg(arg_alpha);
- if (get_global_id(0) < n / (VW)) {
+ const int num_worker_threads = n / (VW * WPT);
+ if (get_global_id(0) < num_worker_threads) {
#pragma unroll
for (int _w = 0; _w < WPT; _w += 1) {
- const int id = _w*get_global_size(0) + get_global_id(0);
+ const int id = _w*num_worker_threads + get_global_id(0);
realV xvalue = xgm[id];
realV yvalue = ygm[id];
ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);