Re-organized level1 xaxpy kernel

author: CNugteren <web@cedricnugteren.nl> 2015-08-22 14:33:48 +0200
committer: CNugteren <web@cedricnugteren.nl> 2015-08-22 14:33:48 +0200
commit: 75517353d505de1d3979866060261a666aebfd36 (patch)
tree: 571d0ef02c95e8567099bc55af88085189212f3c /src/kernels
parent: 70ba7c83d4b5a786264fe547f486840de594950f (diff)
2 files changed, 56 insertions, 33 deletions
diff --git a/src/kernels/xaxpy.opencl b/src/kernels/level1/level1.opencl
index b7ffe9ff..449a20a2 100644
--- a/src/kernels/xaxpy.opencl
+++ b/src/kernels/level1/level1.opencl
@@ -7,9 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
-// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
-// support vector data-types.
+// This file contains the common functions and parameters specific for level 1 BLAS kernels.
 //
 // =================================================================================================
 
@@ -92,36 +90,6 @@ inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
 
 // =================================================================================================
 
-// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const real alpha,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global real* ygm, const int y_offset, const int y_inc) {
-
-  // Loops over the work that needs to be done (allows for an arbitrary number of threads)
-  #pragma unroll
-  for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
-    MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
-  }
-}
-
-// =================================================================================================
-
-// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
-// dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const real alpha,
-                         const __global realV* restrict xgm,
-                         __global realV* ygm) {
-  #pragma unroll
-  for (int w=0; w<WPT; ++w) {
-    const int id = w*get_global_size(0) + get_global_id(0);
-    ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
-  }
-}
-
-// =================================================================================================
-
 // End of the C++11 raw string literal
 )"
 
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
new file mode 100644
index 00000000..3d926d9e
--- /dev/null
+++ b/src/kernels/level1/xaxpy.opencl
@@ -0,0 +1,55 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
+// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
+// support vector data-types.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment out this line for syntax highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Full version of the kernel with offsets and strided accesses
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void Xaxpy(const int n, const real alpha,
+                    const __global real* restrict xgm, const int x_offset, const int x_inc,
+                    __global real* ygm, const int y_offset, const int y_inc) {
+
+  // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+  #pragma unroll
+  for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+    MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
+  }
+}
+
+// =================================================================================================
+
+// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
+// divisible by 'VW', 'WGS' and 'WPT'.
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void XaxpyFast(const int n, const real alpha,
+                         const __global realV* restrict xgm,
+                         __global realV* ygm) {
+  #pragma unroll
+  for (int w=0; w<WPT; ++w) {
+    const int id = w*get_global_size(0) + get_global_id(0);
+    ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
author	CNugteren <web@cedricnugteren.nl>	2015-08-22 14:33:48 +0200
committer	CNugteren <web@cedricnugteren.nl>	2015-08-22 14:33:48 +0200
commit	75517353d505de1d3979866060261a666aebfd36 (patch)
tree	571d0ef02c95e8567099bc55af88085189212f3c /src/kernels
parent	70ba7c83d4b5a786264fe547f486840de594950f (diff)