summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-04-14 20:16:10 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2017-04-14 20:16:10 +0200
commit10205d773e1477fdd634dbc7e224cc71361a9885 (patch)
treeda6ed72c7f530a02a9cae70938fce4b4670066b2
parent0da1e380974007f69b827f6b10ef0243249d0c5e (diff)
Added a new Xaxpy kernel in between the regular and fast version in
-rw-r--r--src/kernels/level1/xaxpy.opencl25
-rw-r--r--src/routines/level1/xaxpy.cpp20
-rw-r--r--src/tuning/kernels/xaxpy.cpp6
-rw-r--r--test/correctness/testblas.cpp2
4 files changed, 40 insertions, 13 deletions
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index f44bbce0..d30d4e55 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -36,12 +36,31 @@ void Xaxpy(const int n, const real_arg arg_alpha,
}
}
+// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
+// assumes that 'n' is dividable by 'VW' and 'WPT'.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFaster(const int n, const real_arg arg_alpha,
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
+ const real alpha = GetRealArg(arg_alpha);
+
+ if (get_global_id(0) < n / (VW)) {
+ #pragma unroll
+ for (int w=0; w<WPT; ++w) {
+ const int id = w*get_global_size(0) + get_global_id(0);
+ realV xvalue = xgm[id];
+ realV yvalue = ygm[id];
+ ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
+ }
+ }
+}
+
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
-void XaxpyFast(const int n, const real_arg arg_alpha,
- const __global realV* restrict xgm,
- __global realV* ygm) {
+void XaxpyFastest(const int n, const real_arg arg_alpha,
+ const __global realV* restrict xgm,
+ __global realV* ygm) {
const real alpha = GetRealArg(arg_alpha);
#pragma unroll
diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp
index 310562a0..0e588d99 100644
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@@ -44,18 +44,21 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
- const auto use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
- (y_offset == 0) && (y_inc == 1) &&
- IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
+ const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
+ (y_offset == 0) && (y_inc == 1) &&
+ IsMultiple(n, db_["WPT"]*db_["VW"]);
+ const auto use_fastest_kernel = use_faster_kernel &&
+ IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
// If possible, run the fast-version of the kernel
- const auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
+ const auto kernel_name = (use_fastest_kernel) ? "XaxpyFastest" :
+ (use_faster_kernel) ? "XaxpyFaster" : "Xaxpy";
// Retrieves the Xaxpy kernel from the compiled binary
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
- if (use_fast_kernel) {
+ if (use_faster_kernel || use_fastest_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
@@ -73,11 +76,16 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
}
// Launches the kernel
- if (use_fast_kernel) {
+ if (use_fastest_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
+ else if (use_faster_kernel) {
+ auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
+ }
else {
const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 23132c51..7984e184 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -27,7 +27,7 @@ class TuneXaxpy {
// The representative kernel and the source code
static std::string KernelFamily() { return "xaxpy"; }
- static std::string KernelName() { return "XaxpyFast"; }
+ static std::string KernelName() { return "XaxpyFastest"; }
static std::string GetSources() {
return
#include "../src/kernels/common.opencl"
@@ -42,7 +42,7 @@ class TuneXaxpy {
// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &args) {
if (!IsMultiple(args.n, 64)) {
- throw std::runtime_error("'XaxpyFast' requires 'n' to be a multiple of WGS*WPT*VW");
+ throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW");
}
}
@@ -52,7 +52,7 @@ class TuneXaxpy {
static size_t DefaultK() { return 1; } // N/A for this kernel
static size_t DefaultBatchCount() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
- static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
+ static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 1bfcb623..7bc9c869 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -24,7 +24,7 @@ namespace clblast {
template <typename T, typename U> const int TestBlas<T,U>::kSeed = 42; // fixed seed for reproducibility
// Test settings for the regular test. Append to these lists in case more tests are required.
-template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVectorDims = { 7, 93, 4096 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVectorDims = { 7, 93, 144, 4096 };
template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kIncrements = { 1, 2, 7 };
template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixDims = { 7, 64 };
template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixVectorDims = { 61, 256 };