diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-03-08 20:10:20 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-03-08 20:10:20 +0100 |
commit | fa0a9c689fc21a2a24aeadf82ae0acdf6d8bf831 (patch) | |
tree | 404e85900a4c9038d407addb38798d06bb48868c /src | |
parent | 6aba0bbae71702c4eebd88d0fe17739b509185c1 (diff) |
Make batched routines based on offsets instead of a vector of cl_mem objects - undoing many earlier changes
Diffstat (limited to 'src')
-rw-r--r-- | src/clblast.cpp | 36 | ||||
-rw-r--r-- | src/clblast_c.cpp | 40 | ||||
-rw-r--r-- | src/clpp11.hpp | 3 | ||||
-rw-r--r-- | src/kernels/level1/xaxpy.opencl | 22 | ||||
-rw-r--r-- | src/routines/levelx/xaxpybatched.cpp | 59 | ||||
-rw-r--r-- | src/routines/levelx/xaxpybatched.hpp | 11 | ||||
-rw-r--r-- | src/utilities/utilities.hpp | 6 |
7 files changed, 116 insertions(+), 61 deletions(-)
diff --git a/src/clblast.cpp b/src/clblast.cpp index e9cac664..d3db8edf 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -2178,57 +2178,57 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose, template <typename T> StatusCode AxpyBatched(const size_t n, const T *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = XaxpyBatched<T>(queue_cpp, event); auto alphas_cpp = std::vector<T>(); - auto x_buffers_cpp = std::vector<Buffer<T>>(); - auto y_buffers_cpp = std::vector<Buffer<T>>(); + auto x_offsets_cpp = std::vector<size_t>(); + auto y_offsets_cpp = std::vector<size_t>(); for (auto batch = size_t{0}; batch < batch_count; ++batch) { alphas_cpp.push_back(alphas[batch]); - x_buffers_cpp.push_back(Buffer<T>(x_buffers[batch])); - y_buffers_cpp.push_back(Buffer<T>(y_buffers[batch])); + x_offsets_cpp.push_back(x_offsets[batch]); + y_offsets_cpp.push_back(y_offsets[batch]); } routine.DoAxpyBatched(n, alphas_cpp, - x_buffers_cpp, x_inc, - y_buffers_cpp, y_inc, + Buffer<T>(x_buffer), x_offsets_cpp, x_inc, + Buffer<T>(y_buffer), y_offsets_cpp, y_inc, batch_count); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API AxpyBatched<float>(const size_t, const float*, - const cl_mem*, const size_t, - cl_mem*, const size_t, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched<double>(const size_t, const double*, - const cl_mem*, const size_t, - cl_mem*, const size_t, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t, const float2*, - const cl_mem*, const size_t, - cl_mem*, const size_t, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t, const double2*, - const cl_mem*, const size_t, - cl_mem*, const size_t, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched<half>(const size_t, const half*, - const cl_mem*, const size_t, - cl_mem*, const size_t, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); // ================================================================================================= diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index bd8ea51a..b09f8c54 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -3450,8 +3450,8 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran // AXPY CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, const float *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, 
const size_t batch_count, cl_command_queue* queue, cl_event* event) { auto alphas_cpp = std::vector<float>(); @@ -3462,8 +3462,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, return static_cast<CLBlastStatusCode>( clblast::AxpyBatched(n, alphas_cpp.data(), - x_buffers, x_inc, - y_buffers, y_inc, + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, batch_count, queue, event) ); @@ -3471,8 +3471,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, } CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, const double *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { auto alphas_cpp = std::vector<double>(); @@ -3483,8 +3483,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, return static_cast<CLBlastStatusCode>( clblast::AxpyBatched(n, alphas_cpp.data(), - x_buffers, x_inc, - y_buffers, y_inc, + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, batch_count, queue, event) ); @@ -3492,8 +3492,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, } CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { auto alphas_cpp = std::vector<float2>(); @@ -3504,8 +3504,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, return static_cast<CLBlastStatusCode>( clblast::AxpyBatched(n, alphas_cpp.data(), - x_buffers, x_inc, - y_buffers, y_inc, + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, batch_count, queue, event) ); @@ -3513,8 +3513,8 @@ CLBlastStatusCode 
CLBlastCaxpyBatched(const size_t n, } CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, const cl_double2 *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { auto alphas_cpp = std::vector<double2>(); @@ -3525,8 +3525,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, return static_cast<CLBlastStatusCode>( clblast::AxpyBatched(n, alphas_cpp.data(), - x_buffers, x_inc, - y_buffers, y_inc, + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, batch_count, queue, event) ); @@ -3534,8 +3534,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, } CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { auto alphas_cpp = std::vector<half>(); @@ -3546,8 +3546,8 @@ CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, return static_cast<CLBlastStatusCode>( clblast::AxpyBatched(n, alphas_cpp.data(), - x_buffers, x_inc, - y_buffers, y_inc, + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, batch_count, queue, event) ); diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 41af28da..29f81cf8 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -600,9 +600,6 @@ class Buffer { // Copies from host to device: writing the device buffer a-synchronously void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { - if (access_ == BufferAccess::kReadOnly) { - throw LogicError("Buffer: writing to a read-only buffer"); - } if (GetSize() < (offset+size)*sizeof(T)) { throw 
LogicError("Buffer: target device buffer is too small"); } diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index ece8476e..0d730c9e 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -9,7 +9,7 @@ // // This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't -// support vector data-types. +// support vector data-types. The general version has a batched implementation as well. // // This kernel uses the level-1 BLAS common tuning parameters. // @@ -36,8 +36,6 @@ void Xaxpy(const int n, const real_arg arg_alpha, } } -// ================================================================================================= - // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) @@ -57,6 +55,24 @@ void XaxpyFast(const int n, const real_arg arg_alpha, // ================================================================================================= +// Full version of the kernel with offsets and strided accesses: batched version +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XaxpyBatched(const int n, const real_arg arg_alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc, + const int batch) { + const real alpha = GetRealArg(arg_alpha); + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id<n; id += get_global_size(0)) { + real xvalue = xgm[id*x_inc + x_offset]; + MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xvalue); + } +} + +// ================================================================================================= + // End of the C++11 
raw string literal )" diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp index 55458f43..8089cdc6 100644 --- a/src/routines/levelx/xaxpybatched.cpp +++ b/src/routines/levelx/xaxpybatched.cpp @@ -22,7 +22,10 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name): - Xaxpy<T>(queue, event, name) { + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xaxpy.opencl" + }) { } // ================================================================================================= @@ -30,19 +33,55 @@ XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::strin // The main routine template <typename T> void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas, - const std::vector<Buffer<T>> &x_buffers, const size_t x_inc, - const std::vector<Buffer<T>> &y_buffers, const size_t y_inc, + const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc, + const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc, const size_t batch_count) { - if (batch_count < 1) { throw BLASError(StatusCode::kInvalidBatchCount); } - if (alphas.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); } - if (x_buffers.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); } - if (y_buffers.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); } + + // Tests for a valid batch count + if ((batch_count < 1) || (alphas.size() != batch_count) || + (x_offsets.size() != batch_count) || (y_offsets.size() != batch_count)) { + throw BLASError(StatusCode::kInvalidBatchCount); + } + + // Makes sure all dimensions are larger than zero + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } + + // 
Tests the vectors for validity + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + TestVectorX(n, x_buffer, x_offsets[batch], x_inc); + TestVectorY(n, y_buffer, y_offsets[batch], y_inc); + } + + // Upload the arguments to the device + std::vector<int> x_offsets_int(x_offsets.begin(), x_offsets.end()); + std::vector<int> y_offsets_int(y_offsets.begin(), y_offsets.end()); + auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + x_offsets_device.Write(queue_, batch_count, x_offsets_int); + y_offsets_device.Write(queue_, batch_count, y_offsets_int); + + // Retrieves the Xaxpy kernel from the compiled binary + auto kernel = Kernel(program_, "XaxpyBatched"); // Naive implementation: calls regular Axpy multiple times for (auto batch = size_t{0}; batch < batch_count; ++batch) { - DoAxpy(n, alphas[batch], - x_buffers[batch], 0, x_inc, - y_buffers[batch], 0, y_inc); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, GetRealArg(alphas[batch])); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast<int>(x_offsets[batch])); + kernel.SetArgument(4, static_cast<int>(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast<int>(y_offsets[batch])); + kernel.SetArgument(7, static_cast<int>(y_inc)); + kernel.SetArgument(8, static_cast<int>(batch)); + + // Launches the kernel + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; + auto local = std::vector<size_t>{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); } } diff --git a/src/routines/levelx/xaxpybatched.hpp b/src/routines/levelx/xaxpybatched.hpp index 7fd14a74..513792ea 100644 --- a/src/routines/levelx/xaxpybatched.hpp +++ b/src/routines/levelx/xaxpybatched.hpp @@ -16,26 +16,23 @@ #include <vector> -#include 
"routines/level1/xaxpy.hpp" +#include "routine.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template <typename T> -class XaxpyBatched: public Xaxpy<T> { +class XaxpyBatched: public Routine { public: - // Uses the regular Xaxpy routine - using Xaxpy<T>::DoAxpy; - // Constructor XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED"); // Templated-precision implementation of the routine void DoAxpyBatched(const size_t n, const std::vector<T> &alphas, - const std::vector<Buffer<T>> &x_buffers, const size_t x_inc, - const std::vector<Buffer<T>> &y_buffers, const size_t y_inc, + const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc, + const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc, const size_t batch_count); }; diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index 851fa251..d271ffee 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -157,7 +157,13 @@ struct Arguments { size_t imax_offset = 0; T alpha = ConstantOne<T>(); T beta = ConstantOne<T>(); + // Batch-specific arguments size_t batch_count = 1; + std::vector<size_t> x_offsets = {0}; + std::vector<size_t> y_offsets = {0}; + std::vector<size_t> a_offsets = {0}; + std::vector<size_t> b_offsets = {0}; + std::vector<size_t> c_offsets = {0}; // Sizes size_t x_size = 1; size_t y_size = 1; |