Added the XSWAP, XSCAL and XCOPY level-1 routines

author: CNugteren <web@cedricnugteren.nl> 2015-08-22 17:11:20 +0200
committer: CNugteren <web@cedricnugteren.nl> 2015-08-22 17:11:20 +0200
commit: ff0c54c3865b45eff807315262e73d3f01cb19c3 (patch)
tree: 839e9def73fb068f988b07e1e879ecce48d884c8 /src/kernels/level1
parent: 75517353d505de1d3979866060261a666aebfd36 (diff)
5 files changed, 223 insertions, 2 deletions
diff --git a/src/kernels/level1/level1.opencl b/src/kernels/level1/level1.opencl
index 449a20a2..7e10426b 100644
--- a/src/kernels/level1/level1.opencl
+++ b/src/kernels/level1/level1.opencl
@@ -46,6 +46,48 @@ R"(
 
 // =================================================================================================
 
+// The vectorized multiply function
+inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) {
+  #if VW == 1
+    Multiply(cvec, aval, bvec);
+  #elif VW == 2
+    Multiply(cvec.x, aval, bvec.x);
+    Multiply(cvec.y, aval, bvec.y);
+  #elif VW == 4
+    Multiply(cvec.x, aval, bvec.x);
+    Multiply(cvec.y, aval, bvec.y);
+    Multiply(cvec.z, aval, bvec.z);
+    Multiply(cvec.w, aval, bvec.w);
+  #elif VW == 8
+    Multiply(cvec.s0, aval, bvec.s0);
+    Multiply(cvec.s1, aval, bvec.s1);
+    Multiply(cvec.s2, aval, bvec.s2);
+    Multiply(cvec.s3, aval, bvec.s3);
+    Multiply(cvec.s4, aval, bvec.s4);
+    Multiply(cvec.s5, aval, bvec.s5);
+    Multiply(cvec.s6, aval, bvec.s6);
+    Multiply(cvec.s7, aval, bvec.s7);
+  #elif VW == 16
+    Multiply(cvec.s0, aval, bvec.s0);
+    Multiply(cvec.s1, aval, bvec.s1);
+    Multiply(cvec.s2, aval, bvec.s2);
+    Multiply(cvec.s3, aval, bvec.s3);
+    Multiply(cvec.s4, aval, bvec.s4);
+    Multiply(cvec.s5, aval, bvec.s5);
+    Multiply(cvec.s6, aval, bvec.s6);
+    Multiply(cvec.s7, aval, bvec.s7);
+    Multiply(cvec.s8, aval, bvec.s8);
+    Multiply(cvec.s9, aval, bvec.s9);
+    Multiply(cvec.sA, aval, bvec.sA);
+    Multiply(cvec.sB, aval, bvec.sB);
+    Multiply(cvec.sC, aval, bvec.sC);
+    Multiply(cvec.sD, aval, bvec.sD);
+    Multiply(cvec.sE, aval, bvec.sE);
+    Multiply(cvec.sF, aval, bvec.sF);
+  #endif
+  return cvec;
+}
+
 // The vectorized multiply-add function
 inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
   #if VW == 1
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index 3d926d9e..1f1e8ce0 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -11,6 +11,8 @@
 // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
 // support vector data-types.
 //
+// This kernel uses the level-1 BLAS common tuning parameters.
+//
 // =================================================================================================
 
 // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
@@ -38,8 +40,8 @@ __kernel void Xaxpy(const int n, const real alpha,
 // dividable by 'VW', 'WGS' and 'WPT'.
 __attribute__((reqd_work_group_size(WGS, 1, 1)))
 __kernel void XaxpyFast(const int n, const real alpha,
-                         const __global realV* restrict xgm,
-                         __global realV* ygm) {
+                        const __global realV* restrict xgm,
+                        __global realV* ygm) {
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
     const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
new file mode 100644
index 00000000..97c27ccf
--- /dev/null
+++ b/src/kernels/level1/xcopy.opencl
@@ -0,0 +1,57 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xcopy kernel. It contains one fast vectorized version in case of unit
+// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
+// support vector data-types.
+//
+// This kernel uses the level-1 BLAS common tuning parameters.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Full version of the kernel with offsets and strided accesses
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void Xcopy(const int n,
+                    const __global real* restrict xgm, const int x_offset, const int x_inc,
+                    __global real* ygm, const int y_offset, const int y_inc) {
+
+  // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+  #pragma unroll
+  for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+    ygm[id*y_inc + y_offset] = xgm[id*x_inc + x_offset];
+  }
+}
+
+// =================================================================================================
+
+// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
+// dividable by 'VW', 'WGS' and 'WPT'.
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void XcopyFast(const int n,
+                        const __global realV* restrict xgm,
+                        __global realV* ygm) {
+  #pragma unroll
+  for (int w=0; w<WPT; ++w) {
+    const int id = w*get_global_size(0) + get_global_id(0);
+    ygm[id] = xgm[id];
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
new file mode 100644
index 00000000..956de3c0
--- /dev/null
+++ b/src/kernels/level1/xscal.opencl
@@ -0,0 +1,59 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xscal kernel. It contains one fast vectorized version in case of unit
+// strides (incx=1) and no offsets (offx=0). Another version is more general, but doesn't support
+// vector data-types.
+//
+// This kernel uses the level-1 BLAS common tuning parameters.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Full version of the kernel with offsets and strided accesses
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void Xscal(const int n, const real alpha,
+                    __global real* xgm, const int x_offset, const int x_inc) {
+
+  // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+  #pragma unroll
+  for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+    real result;
+    Multiply(result, alpha, xgm[id*x_inc + x_offset]);
+    xgm[id*x_inc + x_offset] = result;
+  }
+}
+
+// =================================================================================================
+
+// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
+// dividable by 'VW', 'WGS' and 'WPT'.
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void XscalFast(const int n, const real alpha,
+                        __global realV* xgm) {
+  #pragma unroll
+  for (int w=0; w<WPT; ++w) {
+    const int id = w*get_global_size(0) + get_global_id(0);
+    realV result;
+    result = MultiplyVector(result, alpha, xgm[id]);
+    xgm[id] = result;
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
new file mode 100644
index 00000000..f6487b58
--- /dev/null
+++ b/src/kernels/level1/xswap.opencl
@@ -0,0 +1,61 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xswap kernel. It contains one fast vectorized version in case of unit
+// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
+// support vector data-types.
+//
+// This kernel uses the level-1 BLAS common tuning parameters.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Full version of the kernel with offsets and strided accesses
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void Xswap(const int n,
+                    __global real* xgm, const int x_offset, const int x_inc,
+                    __global real* ygm, const int y_offset, const int y_inc) {
+
+  // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+  #pragma unroll
+  for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+    real temp = xgm[id*x_inc + x_offset];
+    xgm[id*x_inc + x_offset] = ygm[id*y_inc + y_offset];
+    ygm[id*y_inc + y_offset] = temp;
+  }
+}
+
+// =================================================================================================
+
+// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
+// dividable by 'VW', 'WGS' and 'WPT'.
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void XswapFast(const int n,
+                        __global realV* xgm,
+                        __global realV* ygm) {
+  #pragma unroll
+  for (int w=0; w<WPT; ++w) {
+    const int id = w*get_global_size(0) + get_global_id(0);
+    realV temp = xgm[id];
+    xgm[id] = ygm[id];
+    ygm[id] = temp;
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
author	CNugteren <web@cedricnugteren.nl>	2015-08-22 17:11:20 +0200
committer	CNugteren <web@cedricnugteren.nl>	2015-08-22 17:11:20 +0200
commit	ff0c54c3865b45eff807315262e73d3f01cb19c3 (patch)
tree	839e9def73fb068f988b07e1e879ecce48d884c8 /src/kernels/level1
parent	75517353d505de1d3979866060261a666aebfd36 (diff)