diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/clblast.cpp | 59 | ||||
-rw-r--r-- | src/clblast_c.cpp | 107 | ||||
-rw-r--r-- | src/clpp11.hpp | 3 | ||||
-rw-r--r-- | src/kernels/level1/xaxpy.opencl | 22 | ||||
-rw-r--r-- | src/routines/levelx/xaxpybatched.cpp | 95 | ||||
-rw-r--r-- | src/routines/levelx/xaxpybatched.hpp | 43 | ||||
-rw-r--r-- | src/tuning/tuning.hpp | 15 | ||||
-rw-r--r-- | src/utilities/utilities.cpp | 41 | ||||
-rw-r--r-- | src/utilities/utilities.hpp | 14 |
9 files changed, 359 insertions, 40 deletions
diff --git a/src/clblast.cpp b/src/clblast.cpp index a63d766c..d3db8edf 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -71,6 +71,7 @@ // Level-x includes (non-BLAS) #include "routines/levelx/xomatcopy.hpp" +#include "routines/levelx/xaxpybatched.hpp" namespace clblast { @@ -2172,6 +2173,64 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); + +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template <typename T> +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = XaxpyBatched<T>(queue_cpp, event); + auto alphas_cpp = std::vector<T>(); + auto x_offsets_cpp = std::vector<size_t>(); + auto y_offsets_cpp = std::vector<size_t>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + x_offsets_cpp.push_back(x_offsets[batch]); + y_offsets_cpp.push_back(y_offsets[batch]); + } + routine.DoAxpyBatched(n, + alphas_cpp, + Buffer<T>(x_buffer), x_offsets_cpp, x_inc, + Buffer<T>(y_buffer), y_offsets_cpp, y_inc, + batch_count); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API AxpyBatched<float>(const size_t, + const float*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched<double>(const size_t, + const double*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t, + const float2*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t, + const double2*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched<half>(const size_t, + const half*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= // Clears the cache of stored binaries diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index 6018bcfa..b09f8c54 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -3447,6 +3447,113 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } } +// AXPY +CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, + const float *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector<float>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + } + try { + return static_cast<CLBlastStatusCode>( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, + const double *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector<double>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + } + try { + return static_cast<CLBlastStatusCode>( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, + const cl_float2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector<float2>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]}); + } + try { + return static_cast<CLBlastStatusCode>( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, + const cl_double2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector<double2>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]}); + } + try { + return static_cast<CLBlastStatusCode>( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, + const cl_half *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector<half>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + } + try { + return static_cast<CLBlastStatusCode>( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} + // ================================================================================================= // Clears the cache of stored binaries diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 41af28da..29f81cf8 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -600,9 +600,6 @@ class Buffer { // Copies from host to device: writing the device buffer a-synchronously void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { - if (access_ == BufferAccess::kReadOnly) { - throw LogicError("Buffer: writing to a read-only buffer"); - } if (GetSize() < (offset+size)*sizeof(T)) { throw LogicError("Buffer: target device buffer is too small"); } diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index ece8476e..f44bbce0 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -9,7 +9,7 @@ // // This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't -// support vector data-types. +// support vector data-types. The general version has a batched implementation as well. // // This kernel uses the level-1 BLAS common tuning parameters. // @@ -36,8 +36,6 @@ void Xaxpy(const int n, const real_arg arg_alpha, } } -// ================================================================================================= - // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) @@ -57,6 +55,24 @@ void XaxpyFast(const int n, const real_arg arg_alpha, // ================================================================================================= +// Full version of the kernel with offsets and strided accesses: batched version +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XaxpyBatched(const int n, const __constant real_arg* arg_alphas, + const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc, + __global real* ygm, const __constant int* y_offsets, const int y_inc) { + const int batch = get_group_id(1); + const real alpha = GetRealArg(arg_alphas[batch]); + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id<n; id += get_global_size(0)) { + real xvalue = xgm[id*x_inc + x_offsets[batch]]; + MultiplyAdd(ygm[id*y_inc + y_offsets[batch]], alpha, xvalue); + } +} + +// ================================================================================================= + // End of the C++11 raw string literal )" diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp new file mode 100644 index 00000000..6a4269be --- /dev/null +++ b/src/routines/levelx/xaxpybatched.cpp @@ -0,0 +1,95 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the XaxpyBatched class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/levelx/xaxpybatched.hpp" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xaxpy.opencl" + }) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas, + const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc, + const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc, + const size_t batch_count) { + + // Tests for a valid batch count + if ((batch_count < 1) || (alphas.size() != batch_count) || + (x_offsets.size() != batch_count) || (y_offsets.size() != batch_count)) { + throw BLASError(StatusCode::kInvalidBatchCount); + } + + // Makes sure all dimensions are larger than zero + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } + + // Tests the vectors for validity + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + TestVectorX(n, x_buffer, x_offsets[batch], x_inc); + TestVectorY(n, y_buffer, y_offsets[batch], y_inc); + } + + // Upload the arguments to the device + std::vector<int> x_offsets_int(x_offsets.begin(), x_offsets.end()); + std::vector<int> y_offsets_int(y_offsets.begin(), y_offsets.end()); + auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); + x_offsets_device.Write(queue_, batch_count, x_offsets_int); + y_offsets_device.Write(queue_, batch_count, y_offsets_int); + alphas_device.Write(queue_, batch_count, alphas); + + // Retrieves the Xaxpy kernel from the compiled binary + auto kernel = Kernel(program_, "XaxpyBatched"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, alphas_device()); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, x_offsets_device()); + kernel.SetArgument(4, static_cast<int>(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, y_offsets_device()); + kernel.SetArgument(7, static_cast<int>(y_inc)); + + // Launches the kernel + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"], batch_count}; + auto local = std::vector<size_t>{db_["WGS"], 1}; + RunKernel(kernel, queue_, device_, global, local, event_); +} + +// ================================================================================================= + +// Compiles the templated class +template class XaxpyBatched<half>; +template class XaxpyBatched<float>; +template class XaxpyBatched<double>; +template class XaxpyBatched<float2>; +template class XaxpyBatched<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xaxpybatched.hpp b/src/routines/levelx/xaxpybatched.hpp new file mode 100644 index 00000000..513792ea --- /dev/null +++ b/src/routines/levelx/xaxpybatched.hpp @@ -0,0 +1,43 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the XaxpyBatched routine. This is a non-blas batched version of AXPY. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XAXPYBATCHED_H_ +#define CLBLAST_ROUTINES_XAXPYBATCHED_H_ + +#include <vector> + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class XaxpyBatched: public Routine { + public: + + // Constructor + XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED"); + + // Templated-precision implementation of the routine + void DoAxpyBatched(const size_t n, const std::vector<T> &alphas, + const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc, + const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc, + const size_t batch_count); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XAXPYBATCHED_H_ +#endif diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index 1dd76894..7060fc9f 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -17,6 +17,7 @@ #include <vector> #include <string> +#include <random> #include <cltune.h> @@ -77,12 +78,14 @@ void Tuner(int argc, char* argv[]) { auto b_mat = std::vector<T>(C::GetSizeB(args)); auto c_mat = std::vector<T>(C::GetSizeC(args)); auto temp = std::vector<T>(C::GetSizeTemp(args)); - PopulateVector(x_vec, kSeed); - PopulateVector(y_vec, kSeed); - PopulateVector(a_mat, kSeed); - PopulateVector(b_mat, kSeed); - PopulateVector(c_mat, kSeed); - PopulateVector(temp, kSeed); + std::mt19937 mt(kSeed); + std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); + PopulateVector(x_vec, mt, dist); + PopulateVector(y_vec, mt, dist); + PopulateVector(a_mat, mt, dist); + PopulateVector(b_mat, mt, dist); + PopulateVector(c_mat, mt, dist); + PopulateVector(temp, mt, dist); // Initializes the tuner for the chosen device cltune::Tuner tuner(args.platform_id, args.device_id); diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index d68cc1a6..3d091b64 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -67,8 +67,8 @@ template <> double2 Constant(const double val) { return {val, 0.0}; } template <typename T> T SmallConstant() { return static_cast<T>(1e-4); } template float SmallConstant<float>(); template double SmallConstant<double>(); -template <> half SmallConstant() { return FloatToHalf(1e-4); } -template <> float2 SmallConstant() { return {1e-4, 0.0f}; } +template <> half SmallConstant() { return FloatToHalf(1e-4f); } +template <> float2 SmallConstant() { return {1e-4f, 0.0f}; } template <> double2 SmallConstant() { return {1e-4, 0.0}; } // Returns the absolute value of a scalar (modulus in case of a complex number) @@ -326,42 +326,29 @@ unsigned int GetRandomSeed() { // Create a random number generator and populates a vector with samples from a random distribution template <typename T> -void PopulateVector(std::vector<T> &vector, const unsigned int seed) { - auto lower_limit = static_cast<T>(kTestDataLowerLimit); - auto upper_limit = static_cast<T>(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution<T> dist(lower_limit, upper_limit); - for (auto &element: vector) { element = dist(mt); } +void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) { + for (auto &element: vector) { element = static_cast<T>(dist(mt)); } } -template void PopulateVector<float>(std::vector<float>&, const unsigned int); -template void PopulateVector<double>(std::vector<double>&, const unsigned int); +template void PopulateVector<float>(std::vector<float>&, std::mt19937&, std::uniform_real_distribution<double>&); +template void PopulateVector<double>(std::vector<double>&, std::mt19937&, std::uniform_real_distribution<double>&); // Specialized versions of the above for complex data-types template <> -void PopulateVector(std::vector<float2> &vector, const unsigned int seed) { - auto lower_limit = static_cast<float>(kTestDataLowerLimit); - auto upper_limit = static_cast<float>(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution<float> dist(lower_limit, upper_limit); - for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } +void PopulateVector(std::vector<float2> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) { + for (auto &element: vector) { + element.real(static_cast<float>(dist(mt))); + element.imag(static_cast<float>(dist(mt))); + } } template <> -void PopulateVector(std::vector<double2> &vector, const unsigned int seed) { - auto lower_limit = static_cast<double>(kTestDataLowerLimit); - auto upper_limit = static_cast<double>(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution<double> dist(lower_limit, upper_limit); +void PopulateVector(std::vector<double2> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) { for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } } // Specialized versions of the above for half-precision template <> -void PopulateVector(std::vector<half> &vector, const unsigned int seed) { - const auto lower_limit = static_cast<float>(kTestDataLowerLimit); - const auto upper_limit = static_cast<float>(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution<float> dist(lower_limit, upper_limit); - for (auto &element: vector) { element = FloatToHalf(dist(mt)); } +void PopulateVector(std::vector<half> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) { + for (auto &element: vector) { element = FloatToHalf(static_cast<float>(dist(mt))); } } // ================================================================================================= diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index 3c9be6a2..b3db8c22 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -20,6 +20,7 @@ #include <string> #include <functional> #include <complex> +#include <random> #include "clpp11.hpp" #include "clblast.h" @@ -72,6 +73,7 @@ constexpr auto kArgAsumOffset = "offasum"; constexpr auto kArgImaxOffset = "offimax"; constexpr auto kArgAlpha = "alpha"; constexpr auto kArgBeta = "beta"; +constexpr auto kArgBatchCount = "batch_num"; // The tuner-specific arguments in string form constexpr auto kArgFraction = "fraction"; @@ -155,6 +157,16 @@ struct Arguments { size_t imax_offset = 0; T alpha = ConstantOne<T>(); T beta = ConstantOne<T>(); + // Batch-specific arguments + size_t batch_count = 1; + std::vector<size_t> x_offsets = {0}; + std::vector<size_t> y_offsets = {0}; + std::vector<size_t> a_offsets = {0}; + std::vector<size_t> b_offsets = {0}; + std::vector<size_t> c_offsets = {0}; + std::vector<T> alphas = {ConstantOne<T>()}; + std::vector<T> betas = {ConstantOne<T>()}; + // Sizes size_t x_size = 1; size_t y_size = 1; size_t a_size = 1; @@ -234,7 +246,7 @@ constexpr auto kTestDataUpperLimit = 2.0; // Populates a vector with random data template <typename T> -void PopulateVector(std::vector<T> &vector, const unsigned int seed); +void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist); // ================================================================================================= |