summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCNugteren <web@cedricnugteren.nl>2015-08-22 14:33:48 +0200
committerCNugteren <web@cedricnugteren.nl>2015-08-22 14:33:48 +0200
commit75517353d505de1d3979866060261a666aebfd36 (patch)
tree571d0ef02c95e8567099bc55af88085189212f3c /src
parent70ba7c83d4b5a786264fe547f486840de594950f (diff)
Re-organized level1 xaxpy kernel
Diffstat (limited to 'src')
-rw-r--r--src/kernels/level1/level1.opencl (renamed from src/kernels/xaxpy.opencl)34
-rw-r--r--src/kernels/level1/xaxpy.opencl55
-rw-r--r--src/routines/level1/xaxpy.cc3
-rw-r--r--src/tuning/xaxpy.cc3
4 files changed, 60 insertions, 35 deletions
diff --git a/src/kernels/xaxpy.opencl b/src/kernels/level1/level1.opencl
index b7ffe9ff..449a20a2 100644
--- a/src/kernels/xaxpy.opencl
+++ b/src/kernels/level1/level1.opencl
@@ -7,9 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
-// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
-// support vector data-types.
+// This file contains the common functions and parameters specific for level 1 BLAS kernels.
//
// =================================================================================================
@@ -92,36 +90,6 @@ inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
// =================================================================================================
-// Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const real alpha,
- const __global real* restrict xgm, const int x_offset, const int x_inc,
- __global real* ygm, const int y_offset, const int y_inc) {
-
- // Loops over the work that needs to be done (allows for an arbitrary number of threads)
- #pragma unroll
- for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
- MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
- }
-}
-
-// =================================================================================================
-
-// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
-// dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const real alpha,
- const __global realV* restrict xgm,
- __global realV* ygm) {
- #pragma unroll
- for (int w=0; w<WPT; ++w) {
- const int id = w*get_global_size(0) + get_global_id(0);
- ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
- }
-}
-
-// =================================================================================================
-
// End of the C++11 raw string literal
)"
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
new file mode 100644
index 00000000..3d926d9e
--- /dev/null
+++ b/src/kernels/level1/xaxpy.opencl
@@ -0,0 +1,55 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
+// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
+// support vector data-types.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Full version of the kernel: supports offsets and strided accesses, using scalar 'real' data only
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void Xaxpy(const int n, const real alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc) {
+
+ // Each thread starts at its global ID and steps by the global size until 'n' is reached
+ #pragma unroll
+ for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+ MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
+ }
+}
+
+// =================================================================================================
+
+// Faster version of the kernel without offsets and strided accesses, operating on 'realV' data.
+// Assumes that 'n' is divisible by 'VW', 'WGS' and 'WPT': the loop below does not check 'n'.
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void XaxpyFast(const int n, const real alpha,
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
+ #pragma unroll
+ for (int w=0; w<WPT; ++w) {
+ const int id = w*get_global_size(0) + get_global_id(0);
+ ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
+ }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc
index 7646b0e4..ce138fa6 100644
--- a/src/routines/level1/xaxpy.cc
+++ b/src/routines/level1/xaxpy.cc
@@ -32,7 +32,8 @@ template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, Event &event):
Routine<T>(queue, event, "AXPY", {"Xaxpy"}, precision_) {
source_string_ =
- #include "../../kernels/xaxpy.opencl"
+ #include "../../kernels/level1/level1.opencl"
+ #include "../../kernels/level1/xaxpy.opencl"
;
}
diff --git a/src/tuning/xaxpy.cc b/src/tuning/xaxpy.cc
index cc9e81d3..7715b128 100644
--- a/src/tuning/xaxpy.cc
+++ b/src/tuning/xaxpy.cc
@@ -31,7 +31,8 @@ class TuneXaxpy {
static std::string GetSources() {
return
#include "../src/kernels/common.opencl"
- #include "../src/kernels/xaxpy.opencl"
+ #include "../src/kernels/level1/level1.opencl"
+ #include "../src/kernels/level1/xaxpy.opencl"
;
}