4 files changed, 40 insertions, 13 deletions
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index f44bbce0..d30d4e55 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -36,12 +36,31 @@ void Xaxpy(const int n, const real_arg arg_alpha,
   }
 }
 
+// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
+// assumes that 'n' is dividable by 'VW' and 'WPT'.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFaster(const int n, const real_arg arg_alpha,
+                 const __global realV* restrict xgm,
+                 __global realV* ygm) {
+  const real alpha = GetRealArg(arg_alpha);
+
+  if (get_global_id(0) < n / (VW)) {
+    #pragma unroll
+    for (int w=0; w<WPT; ++w) {
+      const int id = w*get_global_size(0) + get_global_id(0);
+      realV xvalue = xgm[id];
+      realV yvalue = ygm[id];
+      ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
+    }
+  }
+}
+
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
 __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
-void XaxpyFast(const int n, const real_arg arg_alpha,
-               const __global realV* restrict xgm,
-               __global realV* ygm) {
+void XaxpyFastest(const int n, const real_arg arg_alpha,
+                  const __global realV* restrict xgm,
+                  __global realV* ygm) {
   const real alpha = GetRealArg(arg_alpha);
 
   #pragma unroll
diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp
index 310562a0..0e588d99 100644
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@@ -44,18 +44,21 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
   TestVectorY(n, y_buffer, y_offset, y_inc);
 
   // Determines whether or not the fast-version can be used
-  const auto use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
-                               (y_offset == 0) && (y_inc == 1) &&
-                               IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
+  const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
+                                 (y_offset == 0) && (y_inc == 1) &&
+                                 IsMultiple(n, db_["WPT"]*db_["VW"]);
+  const auto use_fastest_kernel = use_faster_kernel &&
+                                  IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
 
   // If possible, run the fast-version of the kernel
-  const auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
+  const auto kernel_name = (use_fastest_kernel) ? "XaxpyFastest" :
+                           (use_faster_kernel) ? "XaxpyFaster" : "Xaxpy";
 
   // Retrieves the Xaxpy kernel from the compiled binary
   auto kernel = Kernel(program_, kernel_name);
 
   // Sets the kernel arguments
-  if (use_fast_kernel) {
+  if (use_faster_kernel || use_fastest_kernel) {
     kernel.SetArgument(0, static_cast<int>(n));
     kernel.SetArgument(1, GetRealArg(alpha));
     kernel.SetArgument(2, x_buffer());
@@ -73,11 +76,16 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
   }
 
   // Launches the kernel
-  if (use_fast_kernel) {
+  if (use_fastest_kernel) {
     auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
     auto local = std::vector<size_t>{db_["WGS"]};
     RunKernel(kernel, queue_, device_, global, local, event_);
   }
+  else if (use_faster_kernel) {
+    auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
   else {
     const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
     auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 23132c51..7984e184 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -27,7 +27,7 @@ class TuneXaxpy {
 
   // The representative kernel and the source code
   static std::string KernelFamily() { return "xaxpy"; }
-  static std::string KernelName() { return "XaxpyFast"; }
+  static std::string KernelName() { return "XaxpyFastest"; }
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
@@ -42,7 +42,7 @@ class TuneXaxpy {
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &args) {
     if (!IsMultiple(args.n, 64)) {
-      throw std::runtime_error("'XaxpyFast' requires 'n' to be a multiple of WGS*WPT*VW");
+      throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW");
     }
   }
 
@@ -52,7 +52,7 @@ class TuneXaxpy {
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static size_t DefaultBatchCount() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
-  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
+  static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 1bfcb623..7bc9c869 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -24,7 +24,7 @@ namespace clblast {
 template <typename T, typename U> const int TestBlas<T,U>::kSeed = 42; // fixed seed for reproducibility
 
 // Test settings for the regular test. Append to these lists in case more tests are required.
-template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVectorDims = { 7, 93, 4096 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVectorDims = { 7, 93, 144, 4096 };
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kIncrements = { 1, 2, 7 };
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixDims = { 7, 64 };
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixVectorDims = { 61, 256 };