summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-03-08 20:10:20 +0100
committerCedric Nugteren <web@cedricnugteren.nl>2017-03-08 20:10:20 +0100
commitfa0a9c689fc21a2a24aeadf82ae0acdf6d8bf831 (patch)
tree404e85900a4c9038d407addb38798d06bb48868c /src
parent6aba0bbae71702c4eebd88d0fe17739b509185c1 (diff)
Make batched routines based on offsets instead of a vector of cl_mem objects - undoing many earlier changes
Diffstat (limited to 'src')
-rw-r--r--src/clblast.cpp36
-rw-r--r--src/clblast_c.cpp40
-rw-r--r--src/clpp11.hpp3
-rw-r--r--src/kernels/level1/xaxpy.opencl22
-rw-r--r--src/routines/levelx/xaxpybatched.cpp59
-rw-r--r--src/routines/levelx/xaxpybatched.hpp11
-rw-r--r--src/utilities/utilities.hpp6
7 files changed, 116 insertions, 61 deletions
diff --git a/src/clblast.cpp b/src/clblast.cpp
index e9cac664..d3db8edf 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -2178,57 +2178,57 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
- const cl_mem *x_buffers, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_inc,
+ const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+ cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = XaxpyBatched<T>(queue_cpp, event);
auto alphas_cpp = std::vector<T>();
- auto x_buffers_cpp = std::vector<Buffer<T>>();
- auto y_buffers_cpp = std::vector<Buffer<T>>();
+ auto x_offsets_cpp = std::vector<size_t>();
+ auto y_offsets_cpp = std::vector<size_t>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
- x_buffers_cpp.push_back(Buffer<T>(x_buffers[batch]));
- y_buffers_cpp.push_back(Buffer<T>(y_buffers[batch]));
+ x_offsets_cpp.push_back(x_offsets[batch]);
+ y_offsets_cpp.push_back(y_offsets[batch]);
}
routine.DoAxpyBatched(n,
alphas_cpp,
- x_buffers_cpp, x_inc,
- y_buffers_cpp, y_inc,
+ Buffer<T>(x_buffer), x_offsets_cpp, x_inc,
+ Buffer<T>(y_buffer), y_offsets_cpp, y_inc,
batch_count);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API AxpyBatched<float>(const size_t,
const float*,
- const cl_mem*, const size_t,
- cl_mem*, const size_t,
+ const cl_mem, const size_t*, const size_t,
+ cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double>(const size_t,
const double*,
- const cl_mem*, const size_t,
- cl_mem*, const size_t,
+ const cl_mem, const size_t*, const size_t,
+ cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t,
const float2*,
- const cl_mem*, const size_t,
- cl_mem*, const size_t,
+ const cl_mem, const size_t*, const size_t,
+ cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t,
const double2*,
- const cl_mem*, const size_t,
- cl_mem*, const size_t,
+ const cl_mem, const size_t*, const size_t,
+ cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<half>(const size_t,
const half*,
- const cl_mem*, const size_t,
- cl_mem*, const size_t,
+ const cl_mem, const size_t*, const size_t,
+ cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index bd8ea51a..b09f8c54 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -3450,8 +3450,8 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran
// AXPY
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
const float *alphas,
- const cl_mem *x_buffers, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_inc,
+ const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+ cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float>();
@@ -3462,8 +3462,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
- x_buffers, x_inc,
- y_buffers, y_inc,
+ x_buffer, x_offsets, x_inc,
+ y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
@@ -3471,8 +3471,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
}
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
const double *alphas,
- const cl_mem *x_buffers, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_inc,
+ const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+ cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double>();
@@ -3483,8 +3483,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
- x_buffers, x_inc,
- y_buffers, y_inc,
+ x_buffer, x_offsets, x_inc,
+ y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
@@ -3492,8 +3492,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
}
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
const cl_float2 *alphas,
- const cl_mem *x_buffers, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_inc,
+ const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+ cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float2>();
@@ -3504,8 +3504,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
- x_buffers, x_inc,
- y_buffers, y_inc,
+ x_buffer, x_offsets, x_inc,
+ y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
@@ -3513,8 +3513,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
}
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
const cl_double2 *alphas,
- const cl_mem *x_buffers, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_inc,
+ const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+ cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double2>();
@@ -3525,8 +3525,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
- x_buffers, x_inc,
- y_buffers, y_inc,
+ x_buffer, x_offsets, x_inc,
+ y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
@@ -3534,8 +3534,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
}
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
const cl_half *alphas,
- const cl_mem *x_buffers, const size_t x_inc,
- cl_mem *y_buffers, const size_t y_inc,
+ const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+ cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<half>();
@@ -3546,8 +3546,8 @@ CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
- x_buffers, x_inc,
- y_buffers, y_inc,
+ x_buffer, x_offsets, x_inc,
+ y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 41af28da..29f81cf8 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -600,9 +600,6 @@ class Buffer {
// Copies from host to device: writing the device buffer a-synchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
- if (access_ == BufferAccess::kReadOnly) {
- throw LogicError("Buffer: writing to a read-only buffer");
- }
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: target device buffer is too small");
}
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index ece8476e..0d730c9e 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -9,7 +9,7 @@
//
// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
-// support vector data-types.
+// support vector data-types. The general version has a batched implementation as well.
//
// This kernel uses the level-1 BLAS common tuning parameters.
//
@@ -36,8 +36,6 @@ void Xaxpy(const int n, const real_arg arg_alpha,
}
}
-// =================================================================================================
-
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
@@ -57,6 +55,24 @@ void XaxpyFast(const int n, const real_arg arg_alpha,
// =================================================================================================
+// Full version of the kernel with offsets and strided accesses: batched version
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyBatched(const int n, const real_arg arg_alpha,
+ const __global real* restrict xgm, const int x_offset, const int x_inc,
+ __global real* ygm, const int y_offset, const int y_inc,
+ const int batch) {
+ const real alpha = GetRealArg(arg_alpha);
+
+ // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+ #pragma unroll
+ for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+ real xvalue = xgm[id*x_inc + x_offset];
+ MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xvalue);
+ }
+}
+
+// =================================================================================================
+
// End of the C++11 raw string literal
)"
diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp
index 55458f43..8089cdc6 100644
--- a/src/routines/levelx/xaxpybatched.cpp
+++ b/src/routines/levelx/xaxpybatched.cpp
@@ -22,7 +22,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name):
- Xaxpy<T>(queue, event, name) {
+ Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
+ #include "../../kernels/level1/level1.opencl"
+ #include "../../kernels/level1/xaxpy.opencl"
+ }) {
}
// =================================================================================================
@@ -30,19 +33,55 @@ XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::strin
// The main routine
template <typename T>
void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas,
- const std::vector<Buffer<T>> &x_buffers, const size_t x_inc,
- const std::vector<Buffer<T>> &y_buffers, const size_t y_inc,
+ const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc,
+ const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc,
const size_t batch_count) {
- if (batch_count < 1) { throw BLASError(StatusCode::kInvalidBatchCount); }
- if (alphas.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); }
- if (x_buffers.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); }
- if (y_buffers.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); }
+
+ // Tests for a valid batch count
+ if ((batch_count < 1) || (alphas.size() != batch_count) ||
+ (x_offsets.size() != batch_count) || (y_offsets.size() != batch_count)) {
+ throw BLASError(StatusCode::kInvalidBatchCount);
+ }
+
+ // Makes sure all dimensions are larger than zero
+ if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
+
+ // Tests the vectors for validity
+ for (auto batch = size_t{0}; batch < batch_count; ++batch) {
+ TestVectorX(n, x_buffer, x_offsets[batch], x_inc);
+ TestVectorY(n, y_buffer, y_offsets[batch], y_inc);
+ }
+
+ // Upload the arguments to the device
+ std::vector<int> x_offsets_int(x_offsets.begin(), x_offsets.end());
+ std::vector<int> y_offsets_int(y_offsets.begin(), y_offsets.end());
+ auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+ auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+ x_offsets_device.Write(queue_, batch_count, x_offsets_int);
+ y_offsets_device.Write(queue_, batch_count, y_offsets_int);
+
+ // Retrieves the Xaxpy kernel from the compiled binary
+ auto kernel = Kernel(program_, "XaxpyBatched");
// Naive implementation: calls regular Axpy multiple times
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
- DoAxpy(n, alphas[batch],
- x_buffers[batch], 0, x_inc,
- y_buffers[batch], 0, y_inc);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, GetRealArg(alphas[batch]));
+ kernel.SetArgument(2, x_buffer());
+ kernel.SetArgument(3, static_cast<int>(x_offsets[batch]));
+ kernel.SetArgument(4, static_cast<int>(x_inc));
+ kernel.SetArgument(5, y_buffer());
+ kernel.SetArgument(6, static_cast<int>(y_offsets[batch]));
+ kernel.SetArgument(7, static_cast<int>(y_inc));
+ kernel.SetArgument(8, static_cast<int>(batch));
+
+ // Launches the kernel
+ auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+ auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+ auto local = std::vector<size_t>{db_["WGS"]};
+ RunKernel(kernel, queue_, device_, global, local, event_);
}
}
diff --git a/src/routines/levelx/xaxpybatched.hpp b/src/routines/levelx/xaxpybatched.hpp
index 7fd14a74..513792ea 100644
--- a/src/routines/levelx/xaxpybatched.hpp
+++ b/src/routines/levelx/xaxpybatched.hpp
@@ -16,26 +16,23 @@
#include <vector>
-#include "routines/level1/xaxpy.hpp"
+#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
-class XaxpyBatched: public Xaxpy<T> {
+class XaxpyBatched: public Routine {
public:
- // Uses the regular Xaxpy routine
- using Xaxpy<T>::DoAxpy;
-
// Constructor
XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED");
// Templated-precision implementation of the routine
void DoAxpyBatched(const size_t n, const std::vector<T> &alphas,
- const std::vector<Buffer<T>> &x_buffers, const size_t x_inc,
- const std::vector<Buffer<T>> &y_buffers, const size_t y_inc,
+ const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc,
+ const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc,
const size_t batch_count);
};
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 851fa251..d271ffee 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -157,7 +157,13 @@ struct Arguments {
size_t imax_offset = 0;
T alpha = ConstantOne<T>();
T beta = ConstantOne<T>();
+ // Batch-specific arguments
size_t batch_count = 1;
+ std::vector<size_t> x_offsets = {0};
+ std::vector<size_t> y_offsets = {0};
+ std::vector<size_t> a_offsets = {0};
+ std::vector<size_t> b_offsets = {0};
+ std::vector<size_t> c_offsets = {0};
// Sizes
size_t x_size = 1;
size_t y_size = 1;