diff options
-rw-r--r-- | src/kernels/level1/xaxpy.opencl | 25 | ||||
-rw-r--r-- | src/routines/level1/xaxpy.cpp | 20 | ||||
-rw-r--r-- | src/tuning/kernels/xaxpy.cpp | 6 | ||||
-rw-r--r-- | test/correctness/testblas.cpp | 2 |
4 files changed, 40 insertions, 13 deletions
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index f44bbce0..d30d4e55 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -36,12 +36,31 @@ void Xaxpy(const int n, const real_arg arg_alpha, } } +// Faster version of the kernel without offsets and strided accesses but with if-statement. Also +// assumes that 'n' is dividable by 'VW' and 'WPT'. +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XaxpyFaster(const int n, const real_arg arg_alpha, + const __global realV* restrict xgm, + __global realV* ygm) { + const real alpha = GetRealArg(arg_alpha); + + if (get_global_id(0) < n / (VW)) { + #pragma unroll + for (int w=0; w<WPT; ++w) { + const int id = w*get_global_size(0) + get_global_id(0); + realV xvalue = xgm[id]; + realV yvalue = ygm[id]; + ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue); + } + } +} + // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) -void XaxpyFast(const int n, const real_arg arg_alpha, - const __global realV* restrict xgm, - __global realV* ygm) { +void XaxpyFastest(const int n, const real_arg arg_alpha, + const __global realV* restrict xgm, + __global realV* ygm) { const real alpha = GetRealArg(arg_alpha); #pragma unroll diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 310562a0..0e588d99 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -44,18 +44,21 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha, TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used - const auto use_fast_kernel = (x_offset == 0) && (x_inc == 1) && - (y_offset == 0) && (y_inc == 1) && - IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WPT"]*db_["VW"]); + const auto use_fastest_kernel = use_faster_kernel && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); // If possible, run the fast-version of the kernel - const auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; + const auto kernel_name = (use_fastest_kernel) ? "XaxpyFastest" : + (use_faster_kernel) ? "XaxpyFaster" : "Xaxpy"; // Retrieves the Xaxpy kernel from the compiled binary auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments - if (use_fast_kernel) { + if (use_faster_kernel || use_fastest_kernel) { kernel.SetArgument(0, static_cast<int>(n)); kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); @@ -73,11 +76,16 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha, } // Launches the kernel - if (use_fast_kernel) { + if (use_fastest_kernel) { auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector<size_t>{db_["WGS"]}; RunKernel(kernel, queue_, device_, global, local, event_); } + else if (use_faster_kernel) { + auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } else { const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index 23132c51..7984e184 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -27,7 +27,7 @@ class TuneXaxpy { // The representative kernel and the source code static std::string KernelFamily() { return "xaxpy"; } - static std::string KernelName() { return "XaxpyFast"; } + static std::string KernelName() { return "XaxpyFastest"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" @@ -42,7 +42,7 @@ class TuneXaxpy { // Tests for valid arguments static void TestValidArguments(const Arguments<T> &args) { if (!IsMultiple(args.n, 64)) { - throw std::runtime_error("'XaxpyFast' requires 'n' to be a multiple of WGS*WPT*VW"); + throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW"); } } @@ -52,7 +52,7 @@ class TuneXaxpy { static size_t DefaultK() { return 1; } // N/A for this kernel static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { return args.n; } diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index 1bfcb623..7bc9c869 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -24,7 +24,7 @@ namespace clblast { template <typename T, typename U> const int TestBlas<T,U>::kSeed = 42; // fixed seed for reproducibility // Test settings for the regular test. Append to these lists in case more tests are required. -template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVectorDims = { 7, 93, 4096 }; +template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVectorDims = { 7, 93, 144, 4096 }; template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kIncrements = { 1, 2, 7 }; template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixDims = { 7, 64 }; template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixVectorDims = { 61, 256 }; |