From 10205d773e1477fdd634dbc7e224cc71361a9885 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 14 Apr 2017 20:16:10 +0200 Subject: Added a new Xaxpy kernel in between the regular and fast version in --- src/kernels/level1/xaxpy.opencl | 25 ++++++++++++++++++++++--- src/routines/level1/xaxpy.cpp | 20 ++++++++++++++------ src/tuning/kernels/xaxpy.cpp | 6 +++--- test/correctness/testblas.cpp | 2 +- 4 files changed, 40 insertions(+), 13 deletions(-) diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index f44bbce0..d30d4e55 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -36,12 +36,31 @@ void Xaxpy(const int n, const real_arg arg_alpha, } } +// Faster version of the kernel without offsets and strided accesses but with if-statement. Also +// assumes that 'n' is dividable by 'VW' and 'WPT'. +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XaxpyFaster(const int n, const real_arg arg_alpha, + const __global realV* restrict xgm, + __global realV* ygm) { + const real alpha = GetRealArg(arg_alpha); + + if (get_global_id(0) < n / (VW)) { + #pragma unroll + for (int w=0; w::DoAxpy(const size_t n, const T alpha, TestVectorY(n, y_buffer, y_offset, y_inc); // Determines whether or not the fast-version can be used - const auto use_fast_kernel = (x_offset == 0) && (x_inc == 1) && - (y_offset == 0) && (y_inc == 1) && - IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WPT"]*db_["VW"]); + const auto use_fastest_kernel = use_faster_kernel && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); // If possible, run the fast-version of the kernel - const auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; + const auto kernel_name = (use_fastest_kernel) ? "XaxpyFastest" : + (use_faster_kernel) ? "XaxpyFaster" : "Xaxpy"; // Retrieves the Xaxpy kernel from the compiled binary auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments - if (use_fast_kernel) { + if (use_faster_kernel || use_fastest_kernel) { kernel.SetArgument(0, static_cast(n)); kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); @@ -73,11 +76,16 @@ void Xaxpy::DoAxpy(const size_t n, const T alpha, } // Launches the kernel - if (use_fast_kernel) { + if (use_fastest_kernel) { auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector{db_["WGS"]}; RunKernel(kernel, queue_, device_, global, local, event_); } + else if (use_faster_kernel) { + auto global = std::vector{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])}; + auto local = std::vector{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); + } else { const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"]}; diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index 23132c51..7984e184 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -27,7 +27,7 @@ class TuneXaxpy { // The representative kernel and the source code static std::string KernelFamily() { return "xaxpy"; } - static std::string KernelName() { return "XaxpyFast"; } + static std::string KernelName() { return "XaxpyFastest"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" @@ -42,7 +42,7 @@ class TuneXaxpy { // Tests for valid arguments static void TestValidArguments(const Arguments &args) { if (!IsMultiple(args.n, 64)) { - throw std::runtime_error("'XaxpyFast' requires 'n' to be a multiple of WGS*WPT*VW"); + throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW"); } } @@ -52,7 +52,7 @@ class TuneXaxpy { static size_t DefaultK() { return 1; } // N/A for this kernel static size_t DefaultBatchCount() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n; } diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index 1bfcb623..7bc9c869 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -24,7 +24,7 @@ namespace clblast { template const int TestBlas::kSeed = 42; // fixed seed for reproducibility // Test settings for the regular test. Append to these lists in case more tests are required. -template const std::vector TestBlas::kVectorDims = { 7, 93, 4096 }; +template const std::vector TestBlas::kVectorDims = { 7, 93, 144, 4096 }; template const std::vector TestBlas::kIncrements = { 1, 2, 7 }; template const std::vector TestBlas::kMatrixDims = { 7, 64 }; template const std::vector TestBlas::kMatrixVectorDims = { 61, 256 }; -- cgit v1.2.3