diff options
author | CNugteren <web@cedricnugteren.nl> | 2015-05-30 12:30:43 +0200 |
---|---|---|
committer | CNugteren <web@cedricnugteren.nl> | 2015-05-30 12:30:43 +0200 |
commit | bc5a341dfe591946e925db315fc7d8c0c25c2938 (patch) | |
tree | b216ab5eee4863e3807d92b5ddd19fa22197ed22 /src | |
parent | c7b054ea6747039f4405fd93da6e924f3e5c7f4b (diff) |
Initial commit of preview version
Diffstat (limited to 'src')
-rw-r--r-- | src/clblast.cc | 224 | ||||
-rw-r--r-- | src/database.cc | 112 | ||||
-rw-r--r-- | src/kernels/common.opencl | 120 | ||||
-rw-r--r-- | src/kernels/copy.opencl | 73 | ||||
-rw-r--r-- | src/kernels/pad.opencl | 180 | ||||
-rw-r--r-- | src/kernels/padtranspose.opencl | 150 | ||||
-rw-r--r-- | src/kernels/transpose.opencl | 168 | ||||
-rw-r--r-- | src/kernels/xaxpy.opencl | 128 | ||||
-rw-r--r-- | src/kernels/xgemm.opencl | 570 | ||||
-rw-r--r-- | src/routine.cc | 326 | ||||
-rw-r--r-- | src/routines/xaxpy.cc | 115 | ||||
-rw-r--r-- | src/routines/xgemm.cc | 168 | ||||
-rw-r--r-- | src/routines/xsymm.cc | 132 | ||||
-rw-r--r-- | src/tuning/copy.cc | 83 | ||||
-rw-r--r-- | src/tuning/pad.cc | 90 | ||||
-rw-r--r-- | src/tuning/padtranspose.cc | 95 | ||||
-rw-r--r-- | src/tuning/transpose.cc | 88 | ||||
-rw-r--r-- | src/tuning/tuning.cc | 186 | ||||
-rw-r--r-- | src/tuning/xaxpy.cc | 88 | ||||
-rw-r--r-- | src/tuning/xgemm.cc | 126 | ||||
-rw-r--r-- | src/utilities.cc | 255 |
21 files changed, 3477 insertions, 0 deletions
diff --git a/src/clblast.cc b/src/clblast.cc new file mode 100644 index 00000000..72de3b24 --- /dev/null +++ b/src/clblast.cc @@ -0,0 +1,224 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements all the BLAS API calls. In all cases, it does not much more than creating +// a new object of the appropriate type, and calling the main routine on that object. It forwards +// all status codes to the caller. +// +// ================================================================================================= + +#include <string> + +#include "clblast.h" + +// BLAS level-1 includes +#include "internal/routines/xaxpy.h" + +// BLAS level-3 includes +#include "internal/routines/xgemm.h" +#include "internal/routines/xsymm.h" + +namespace clblast { +// ================================================================================================= +// BLAS level-1 (vector-vector) routines + +// AXPY +template <typename T> +StatusCode Axpy(const size_t n, const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = CommandQueue(*queue); + auto event_cpp = Event(*event); + auto xaxpy = Xaxpy<T>(queue_cpp, event_cpp); + + // Loads the kernel source-code as an include (C++11 raw string literal) + std::string kernel_source = + #include "kernels/xaxpy.opencl" + auto status = xaxpy.SetUp(kernel_source); + if (status != StatusCode::kSuccess) { return status; } + + // Runs the routine + return xaxpy.DoAxpy(n, alpha, + Buffer(x_buffer), x_offset, x_inc, + 
Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Axpy<float>(const size_t, const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Axpy<double>(const size_t, const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Axpy<float2>(const size_t, const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Axpy<double2>(const size_t, const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines + +// GEMM +template <typename T> +StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b, + const size_t m, const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = CommandQueue(*queue); + auto event_cpp = Event(*event); + auto xgemm = Xgemm<T>(queue_cpp, event_cpp); + + // Loads the kernel source-code as an include (C++11 raw string literal) + std::string common_source1 = + #include "kernels/copy.opencl" + std::string common_source2 = + #include "kernels/pad.opencl" + std::string common_source3 = + #include "kernels/transpose.opencl" + std::string common_source4 = + #include "kernels/padtranspose.opencl" + std::string kernel_source = + 
#include "kernels/xgemm.opencl" + auto status = xgemm.SetUp(common_source1 + common_source2 + common_source3 + common_source4 + + kernel_source); + if (status != StatusCode::kSuccess) { return status; } + + // Runs the routine + return xgemm.DoGemm(layout, transpose_a, transpose_b, + m, n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); +} +template StatusCode Gemm<float>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gemm<double>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +/* +template StatusCode Gemm<float2>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gemm<double2>(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +*/ + +// ================================================================================================= + +// SYMM +template <typename T> +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const 
size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = CommandQueue(*queue); + auto event_cpp = Event(*event); + auto xsymm = Xsymm<T>(queue_cpp, event_cpp); + + // Loads the kernel source-code as an include (C++11 raw string literal) + std::string common_source1 = + #include "kernels/copy.opencl" + std::string common_source2 = + #include "kernels/pad.opencl" + std::string common_source3 = + #include "kernels/transpose.opencl" + std::string common_source4 = + #include "kernels/padtranspose.opencl" + std::string kernel_source = + #include "kernels/xgemm.opencl" + auto status = xsymm.SetUp(common_source1 + common_source2 + common_source3 + common_source4 + + kernel_source); + if (status != StatusCode::kSuccess) { return status; } + + // Runs the routine + return xsymm.DoSymm(layout, side, triangle, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, + beta, + Buffer(c_buffer), c_offset, c_ld); +} +template StatusCode Symm<float>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Symm<double>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +/* +template StatusCode Symm<float2>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + 
cl_command_queue*, cl_event*); +template StatusCode Symm<double2>(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +*/ + +// ================================================================================================= +} // namespace clblast diff --git a/src/database.cc b/src/database.cc new file mode 100644 index 00000000..beaa122b --- /dev/null +++ b/src/database.cc @@ -0,0 +1,112 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Database class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/database.h" +#include "internal/database/xaxpy.h" +#include "internal/database/xgemm.h" +#include "internal/database/copy.h" +#include "internal/database/pad.h" +#include "internal/database/transpose.h" +#include "internal/database/padtranspose.h" + +#include "internal/utilities.h" + +namespace clblast { +// ================================================================================================= + +// Initializes the database +const std::vector<Database::DatabaseEntry> Database::database = { + XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble, + XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, + CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, + PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, + TraSingle, TraDouble, TraComplexSingle, TraComplexDouble, + PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble +}; + +// ================================================================================================= + +// Constructor, computing device properties and populating the parameter-vector from the database +Database::Database(const CommandQueue &queue, const std::vector<std::string> &kernels, + const Precision precision): + parameters_{} { + + // Finds information of the current device + auto device = queue.GetDevice(); + auto device_type = device.Type(); + auto device_vendor = device.Vendor(); + auto device_name = device.Name(); + + // Iterates over all kernels to include, and retrieves the parameters for each of them + for (auto &kernel: kernels) { + auto search_result = Search(kernel, device_type, device_vendor, device_name, precision); + parameters_.insert(search_result.begin(), search_result.end()); + } +} + +// ================================================================================================= + +// Returns a list of OpenCL pre-processor 
defines in string form +std::string Database::GetDefines() const { + std::string defines{}; + for (auto ¶meter: parameters_) { + defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n"; + } + return defines; +} + +// ================================================================================================= + +// Searches the database for the right kernel and precision +Database::Parameters Database::Search(const std::string &this_kernel, + const cl_device_type this_type, + const std::string &this_vendor, + const std::string &this_device, + const Precision this_precision) const { + for (auto &db: database) { + if (db.kernel == this_kernel && db.precision == this_precision) { + + // Searches for the right vendor and device type, or selects the default if unavailable. This + // assumes that the default vendor / device type is last in the database. + for (auto &vendor: db.vendors) { + if (VendorEqual(vendor.name, this_vendor) && + (vendor.type == this_type || vendor.type == CL_DEVICE_TYPE_ALL)) { + + // Searches for the right device. If the current device is unavailable, selects the vendor + // default parameters. This assumes the default is last in the database. + for (auto &device: vendor.devices) { + if (device.name == this_device || device.name == kDefault) { + + // Sets the parameters accordingly + return device.parameters; + } + } + } + } + } + } + + // If we reached this point, something is wrong + throw std::runtime_error("Database error, could not find a suitable entry"); +} + +// Determines the equality between two vendor names. This is implemented because vendor names can +// be ambigious and might change between different SDK or driver versions. 
+bool Database::VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const { + if (db_vendor == kDefault) { return true; } + if (db_vendor == cl_vendor) { return true; } + return false; +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl new file mode 100644 index 00000000..154265e4 --- /dev/null +++ b/src/kernels/common.opencl @@ -0,0 +1,120 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the common defines and type-defs for the CLBlast OpenCL kernels. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this file is used outside of the CLBlast library. 
+#ifndef PRECISION + #define PRECISION 32 // Data-types: single or double precision, complex or regular +#endif + +// ================================================================================================= + +// Enable support for double-precision +#if PRECISION == 64 || PRECISION == 6464 + #if __OPENCL_VERSION__ <= CL_VERSION_1_1 + #pragma OPENCL EXTENSION cl_khr_fp64: enable + #endif +#endif + +// Single-precision +#if PRECISION == 32 + typedef float real; + typedef float2 real2; + typedef float4 real4; + typedef float8 real8; + typedef float16 real16; + #define ZERO 0.0f + +// Double-precision +#elif PRECISION == 64 + typedef double real; + typedef double2 real2; + typedef double4 real4; + typedef double8 real8; + typedef double16 real16; + #define ZERO 0.0 + +// Complex single-precision +#elif PRECISION == 3232 + typedef struct cfloat {float x; float y;} real; + typedef struct cfloat2 {real x; real y;} real2; + typedef struct cfloat4 {real x; real y; real z; real w;} real4; + typedef struct cfloat8 {real s0; real s1; real s2; real s3; + real s4; real s5; real s6; real s7;} real8; + typedef struct cfloat16 {real s0; real s1; real s2; real s3; + real s4; real s5; real s6; real s7; + real s8; real s9; real sA; real sB; + real sC; real sD; real sE; real sF;} real16; + #define ZERO 0.0f + +// Complex Double-precision +#elif PRECISION == 6464 + typedef struct cdouble {double x; double y;} real; + typedef struct cdouble2 {real x; real y;} real2; + typedef struct cdouble4 {real x; real y; real z; real w;} real4; + typedef struct cdouble8 {real s0; real s1; real s2; real s3; + real s4; real s5; real s6; real s7;} real8; + typedef struct cdouble16 {real s0; real s1; real s2; real s3; + real s4; real s5; real s6; real s7; + real s8; real s9; real sA; real sB; + real sC; real sD; real sE; real sF;} real16; + #define ZERO 0.0 +#endif + +// ================================================================================================= + +// Don't use the 
non-IEEE754 compliant OpenCL built-in mad() instruction +#define USE_CL_MAD 0 + +// Sets a variable to zero +#if PRECISION == 3232 || PRECISION == 6464 + #define SetToZero(a) a.x = ZERO; a.y = ZERO +#else + #define SetToZero(a) a = ZERO +#endif + +// Multiply two complex variables (used in the define below) +#if PRECISION == 3232 || PRECISION == 6464 + #define MulReal(a, b) a.x*b.x - a.y*b.y + #define MulImag(a, b) a.x*b.y + a.y*b.x +#endif + +// The scalar multiply-add function +#if PRECISION == 3232 || PRECISION == 6464 + #define MultiplyAdd(c, a, b) c.x += MulReal(a,b); c.y += MulImag(a,b) +#else + #if USE_CL_MAD == 1 + #define MultiplyAdd(c, a, b) c = mad(a, b, c) + #else + #define MultiplyAdd(c, a, b) c += a * b + #endif +#endif + +// The scalar AXPBY function +#if PRECISION == 3232 || PRECISION == 6464 + #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d) +#else + #define AXPBY(e, a, b, c, d) e = a*b + c*d +#endif + +// ================================================================================================= + +// End of the C++11 raw string literal +)"; + +// ================================================================================================= diff --git a/src/kernels/copy.opencl b/src/kernels/copy.opencl new file mode 100644 index 00000000..f95b476b --- /dev/null +++ b/src/kernels/copy.opencl @@ -0,0 +1,73 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the common kernels shared among different BLAS routines. This file contains +// kernels to copy matrices. 
+// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef COPY_DIMX + #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef COPY_DIMY + #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef COPY_WPT + #define COPY_WPT 1 // Work per thread in the first dimension (x) +#endif +#ifndef COPY_VW + #define COPY_VW 1 // Vector width in the second dimension (y) +#endif + +// ================================================================================================= + +// Data-widths +#if COPY_VW == 1 + typedef real realC; +#elif COPY_VW == 2 + typedef real2 realC; +#elif COPY_VW == 4 + typedef real4 realC; +#elif COPY_VW == 8 + typedef real8 realC; +#elif COPY_VW == 16 + typedef real16 realC; +#endif + +// ================================================================================================= + +// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of +// COPY_VW. Also requires both matrices to be of the same dimensions and without offset. 
+__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +__kernel void CopyMatrix(const int ld, + __global const realC* restrict src, + __global realC* dest) { + #pragma unroll + for (int w_one=0; w_one<COPY_WPT; ++w_one) { + const int id_one = get_global_id(0); + const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1); + const int id = id_two*(ld/COPY_VW) + id_one; + dest[id] = src[id]; + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)"; + +// ================================================================================================= diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl new file mode 100644 index 00000000..ccaeb9d6 --- /dev/null +++ b/src/kernels/pad.opencl @@ -0,0 +1,180 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the common kernels shared among different BLAS routines. This file contains +// kernels to copy and pad matrices in various ways, including: +// 1) copying into a larger matrix by adding padding +// 2) copying into a smaller matrix by removing padding +// 3) from upper/lower triangle into a full matrix +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. 
+R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef PAD_DIMX + #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef PAD_DIMY + #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef PAD_WPTX + #define PAD_WPTX 1 // Work per thread in the first dimension (x) +#endif +#ifndef PAD_WPTY + #define PAD_WPTY 1 // Work per thread in the second dimension (y) +#endif + +// ================================================================================================= + +// Copies a matrix from source to destination. The output is padded with zero values in case the +// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld +// value and offset can be different. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void PadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_two && id_one < dest_one) { + + // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the + // value to be written to zero. 
+ real value; + SetToZero(value); + if (id_two < src_two && id_one < src_one) { + value = src[id_two*src_ld + id_one + src_offset]; + } + + // Stores the value in the destination matrix + dest[id_two*dest_ld + id_one + dest_offset] = value; + } + } + } +} + +// ================================================================================================= + +// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but +// writes only the actual data back to the destination matrix. Again, the ld value and offset can +// be different. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void UnPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_two && id_one < dest_one) { + + // Copies the value into the destination matrix. This is always within bounds of the source + // matrix, as we know that the destination matrix is smaller than the source. + dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset]; + } + } + } +} + +// ================================================================================================= + +// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. 
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void SymmLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_dim && id_one < dest_dim) { + + // Loads data from the lower-symmetric matrix + real value; + SetToZero(value); + if (id_two < src_dim && id_one < src_dim) { + if (id_two <= id_one) { value = src[id_two*src_ld + id_one + src_offset]; } + else { value = src[id_one*src_ld + id_two + src_offset]; } + } + + // Stores the value in the destination matrix + dest[id_two*dest_ld + id_one + dest_offset] = value; + } + } + } +} + +// Same as above, but now the matrix' data is stored in the upper-triangle +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void SymmUpperToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_dim && id_one < dest_dim) { + + // Loads data from the upper-symmetric matrix + real value; + SetToZero(value); + if (id_two < src_dim && id_one < src_dim) { + if (id_one <= 
id_two) { value = src[id_two*src_ld + id_one + src_offset]; } + else { value = src[id_one*src_ld + id_two + src_offset]; } + } + + // Stores the value in the destination matrix + dest[id_two*dest_ld + id_one + dest_offset] = value; + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)"; + +// ================================================================================================= diff --git a/src/kernels/padtranspose.opencl b/src/kernels/padtranspose.opencl new file mode 100644 index 00000000..67cbf341 --- /dev/null +++ b/src/kernels/padtranspose.opencl @@ -0,0 +1,150 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// kernels to transpose matrices in various ways, including: +// 1) transposing into a larger matrix by adding padding +// 2) transposing into a smaller matrix by removing padding +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. 
#ifndef PADTRA_TILE
  #define PADTRA_TILE 8      // Number of local threads in the two dimensions (x,y)
#endif
#ifndef PADTRA_WPT
  #define PADTRA_WPT 1       // Amount of work per thread
#endif
#ifndef PADTRA_PAD
  #define PADTRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
#endif

// =================================================================================================

// Same as PadCopyMatrix, but also transposes: reads from 'src' and writes the transposed and
// zero-padded result into the (possibly larger) 'dest' matrix.
__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
__kernel void PadTransposeMatrix(const int src_one, const int src_two,
                                 const int src_ld, const int src_offset,
                                 __global const real* restrict src,
                                 const int dest_one, const int dest_two,
                                 const int dest_ld, const int dest_offset,
                                 __global real* dest) {

  // Local memory tile used to obtain coalesced accesses for both the loads and the stores
  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];

  // Phase 1: each thread loads PADTRA_WPT x PADTRA_WPT elements into the tile
  #pragma unroll
  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
    #pragma unroll
    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {

      // Source indices: the workgroup IDs are used swapped with respect to the local IDs, since
      // the local and global dimensions do not correspond to each other (transposition)
      const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
      const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);

      // Loads the element if within the source bounds, otherwise stores a zero (the padding)
      real value;
      SetToZero(value);
      if (id_src_two < src_two && id_src_one < src_one) {
        value = src[id_src_two*src_ld + id_src_one + src_offset];
      }
      tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
    }
  }

  // Waits until the whole tile is populated
  barrier(CLK_LOCAL_MEM_FENCE);

  // Phase 2: each thread stores PADTRA_WPT x PADTRA_WPT transposed elements to global memory
  #pragma unroll
  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
    #pragma unroll
    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {

      // Destination indices (no swap here: the tile already holds the transposed data)
      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);

      // Stores the result, guarded against the destination bounds
      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
        const real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
      }
    }
  }
}

// Same as UnPadCopyMatrix, but also transposes: reads from the (possibly padded) 'src' matrix and
// writes only the valid region, transposed, into 'dest'.
__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
__kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
                                   const int src_ld, const int src_offset,
                                   __global const real* restrict src,
                                   const int dest_one, const int dest_two,
                                   const int dest_ld, const int dest_offset,
                                   __global real* dest) {

  // Local memory tile used to obtain coalesced accesses for both the loads and the stores
  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];

  // Phase 1: each thread loads PADTRA_WPT x PADTRA_WPT elements into the tile
  #pragma unroll
  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
    #pragma unroll
    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {

      // Source indices: the workgroup IDs are used swapped with respect to the local IDs, since
      // the local and global dimensions do not correspond to each other (transposition)
      const int id_src_one = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(0);
      const int id_src_two = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(1);

      // Loads only within the source bounds; out-of-range tile entries are left untouched, which
      // is safe because the corresponding destination elements are never written below
      if ((id_src_one < src_one) && (id_src_two < src_two)) {
        const real value = src[id_src_two*src_ld + id_src_one + src_offset];
        tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
      }
    }
  }

  // Waits until the whole tile is populated
  barrier(CLK_LOCAL_MEM_FENCE);

  // Phase 2: each thread stores PADTRA_WPT x PADTRA_WPT transposed elements to global memory
  #pragma unroll
  for (int w_one=0; w_one<PADTRA_WPT; ++w_one) {
    #pragma unroll
    for (int w_two=0; w_two<PADTRA_WPT; ++w_two) {

      // Destination indices (no swap here: the tile already holds the transposed data)
      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);

      // Stores the result, guarded against the destination bounds (this removes the padding)
      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
        const real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
      }
    }
  }
}

// =================================================================================================

// End of the C++11 raw string literal
)";

// =================================================================================================
diff --git a/src/kernels/transpose.opencl b/src/kernels/transpose.opencl
new file mode 100644
index 00000000..79ab1688
--- /dev/null
+++ b/src/kernels/transpose.opencl
@@ -0,0 +1,168 @@

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the common kernels shared among different BLAS functions. This file contains
// kernels to transpose matrices.
//
// =================================================================================================

// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(

// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef TRA_DIM
  #define TRA_DIM 8    // Number of local threads in the two dimensions (x,y)
#endif
#ifndef TRA_WPT
  #define TRA_WPT 1    // Work per thread in one dimension and vector-width in the other
#endif
#ifndef TRA_PAD
  #define TRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
#endif

// =================================================================================================

// Vector data-type matching the configured work-per-thread / vector-width TRA_WPT
#if TRA_WPT == 1
  typedef real realT;
#elif TRA_WPT == 2
  typedef real2 realT;
#elif TRA_WPT == 4
  typedef real4 realT;
#elif TRA_WPT == 8
  typedef real8 realT;
#elif TRA_WPT == 16
  typedef real16 realT;
#endif

// =================================================================================================

// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
// offset. A more general version is available in 'padtranspose.opencl'.
__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
__kernel void TransposeMatrix(const int ld,
                              __global const realT* restrict src,
                              __global realT* dest) {

  // Local tile of scalar values, used to exchange data between threads such that both the
  // global-memory reads and writes are coalesced
  __local real tile[TRA_WPT*TRA_DIM][TRA_WPT*TRA_DIM + TRA_PAD];

  // Phase 1: loads TRA_WPT vectors per thread and scatters their components into the tile
  #pragma unroll
  for (int w_one=0; w_one<TRA_WPT; ++w_one) {

    // Source indices: the workgroup IDs are used swapped with respect to the local IDs, since the
    // local and global dimensions do not correspond to each other (transposition)
    const int id_one = get_group_id(1) * TRA_DIM + get_local_id(0);
    const int id_two = (get_group_id(0) * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;

    // Loads one vector ('ld' is expressed in scalars, hence the division by the vector width)
    realT vec = src[id_two*(ld/TRA_WPT) + id_one];
    #if TRA_WPT == 1
      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = vec;
    #elif TRA_WPT == 2
      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = vec.x;
      tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = vec.y;
    #elif TRA_WPT == 4
      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = vec.x;
      tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = vec.y;
      tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = vec.z;
      tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = vec.w;
    #elif TRA_WPT == 8
      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = vec.s0;
      tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = vec.s1;
      tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = vec.s2;
      tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = vec.s3;
      tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = vec.s4;
      tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = vec.s5;
      tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = vec.s6;
      tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = vec.s7;
    #elif TRA_WPT == 16
      tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = vec.s0;
      tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = vec.s1;
      tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = vec.s2;
      tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = vec.s3;
      tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = vec.s4;
      tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = vec.s5;
      tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = vec.s6;
      tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = vec.s7;
      tile[get_local_id(1)*TRA_WPT + 8][get_local_id(0)*TRA_WPT + w_one] = vec.s8;
      tile[get_local_id(1)*TRA_WPT + 9][get_local_id(0)*TRA_WPT + w_one] = vec.s9;
      tile[get_local_id(1)*TRA_WPT + 10][get_local_id(0)*TRA_WPT + w_one] = vec.sA;
      tile[get_local_id(1)*TRA_WPT + 11][get_local_id(0)*TRA_WPT + w_one] = vec.sB;
      tile[get_local_id(1)*TRA_WPT + 12][get_local_id(0)*TRA_WPT + w_one] = vec.sC;
      tile[get_local_id(1)*TRA_WPT + 13][get_local_id(0)*TRA_WPT + w_one] = vec.sD;
      tile[get_local_id(1)*TRA_WPT + 14][get_local_id(0)*TRA_WPT + w_one] = vec.sE;
      tile[get_local_id(1)*TRA_WPT + 15][get_local_id(0)*TRA_WPT + w_one] = vec.sF;
    #endif
  }

  // Waits until the whole tile is populated
  barrier(CLK_LOCAL_MEM_FENCE);

  // Phase 2: gathers TRA_WPT transposed vectors per thread from the tile and stores them
  #pragma unroll
  for (int w_two=0; w_two<TRA_WPT; ++w_two) {

    // Destination indices (no swap here: the tile already holds the transposed data)
    const int id_one = get_global_id(0);
    const int id_two = get_global_id(1)*TRA_WPT + w_two;

    // Assembles one output vector from the tile and stores it
    realT vec;
    #if TRA_WPT == 1
      vec = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
    #elif TRA_WPT == 2
      vec.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
      vec.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
    #elif TRA_WPT == 4
      vec.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
      vec.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
      vec.z = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
      vec.w = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
    #elif TRA_WPT == 8
      vec.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
      vec.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
      vec.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
      vec.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
      vec.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
      vec.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
      vec.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
      vec.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
    #elif TRA_WPT == 16
      vec.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
      vec.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
      vec.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
      vec.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
      vec.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
      vec.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
      vec.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
      vec.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
      vec.s8 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 8];
      vec.s9 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 9];
      vec.sA = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 10];
      vec.sB = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 11];
      vec.sC = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 12];
      vec.sD = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 13];
      vec.sE = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 14];
      vec.sF = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 15];
    #endif
    dest[id_two*(ld/TRA_WPT) + id_one] = vec;
  }
}

// =================================================================================================

// End of the C++11 raw string literal
)";

// =================================================================================================
diff --git a/src/kernels/xaxpy.opencl b/src/kernels/xaxpy.opencl
new file mode 100644
index 00000000..40c6c3bd
--- /dev/null
+++ b/src/kernels/xaxpy.opencl
@@ -0,0 +1,128 @@

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
// support vector data-types.
//
// =================================================================================================

// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(

// =================================================================================================

// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS
  #define WGS 64    // The local work-group size
#endif
#ifndef WPT
  #define WPT 1     // The amount of work-per-thread
#endif
#ifndef VW
  #define VW 1      // Vector width of vectors X and Y
#endif

// =================================================================================================

// Vector data-type matching the configured vector width VW
#if VW == 1
  typedef real realV;
#elif VW == 2
  typedef real2 realV;
#elif VW == 4
  typedef real4 realV;
#elif VW == 8
  typedef real8 realV;
#elif VW == 16
  typedef real16 realV;
#endif

// =================================================================================================

// The vectorized multiply-add function: updates each component of 'cvec' with aval times the
// corresponding component of 'bvec' (the scalar MultiplyAdd routine is provided elsewhere,
// presumably by 'common.opencl') and returns the result.
inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
  #if VW == 1
    MultiplyAdd(cvec, aval, bvec);
  #elif VW == 2
    MultiplyAdd(cvec.x, aval, bvec.x);
    MultiplyAdd(cvec.y, aval, bvec.y);
  #elif VW == 4
    MultiplyAdd(cvec.x, aval, bvec.x);
    MultiplyAdd(cvec.y, aval, bvec.y);
    MultiplyAdd(cvec.z, aval, bvec.z);
    MultiplyAdd(cvec.w, aval, bvec.w);
  #elif VW == 8
    MultiplyAdd(cvec.s0, aval, bvec.s0);
    MultiplyAdd(cvec.s1, aval, bvec.s1);
    MultiplyAdd(cvec.s2, aval, bvec.s2);
    MultiplyAdd(cvec.s3, aval, bvec.s3);
    MultiplyAdd(cvec.s4, aval, bvec.s4);
    MultiplyAdd(cvec.s5, aval, bvec.s5);
    MultiplyAdd(cvec.s6, aval, bvec.s6);
    MultiplyAdd(cvec.s7, aval, bvec.s7);
  #elif VW == 16
    MultiplyAdd(cvec.s0, aval, bvec.s0);
    MultiplyAdd(cvec.s1, aval, bvec.s1);
    MultiplyAdd(cvec.s2, aval, bvec.s2);
    MultiplyAdd(cvec.s3, aval, bvec.s3);
    MultiplyAdd(cvec.s4, aval, bvec.s4);
    MultiplyAdd(cvec.s5, aval, bvec.s5);
    MultiplyAdd(cvec.s6, aval, bvec.s6);
    MultiplyAdd(cvec.s7, aval, bvec.s7);
    MultiplyAdd(cvec.s8, aval, bvec.s8);
    MultiplyAdd(cvec.s9, aval, bvec.s9);
    MultiplyAdd(cvec.sA, aval, bvec.sA);
    MultiplyAdd(cvec.sB, aval, bvec.sB);
    MultiplyAdd(cvec.sC, aval, bvec.sC);
    MultiplyAdd(cvec.sD, aval, bvec.sD);
    MultiplyAdd(cvec.sE, aval, bvec.sE);
    MultiplyAdd(cvec.sF, aval, bvec.sF);
  #endif
  return cvec;
}

// =================================================================================================

// Full version of the kernel: supports arbitrary offsets and increments for both X and Y
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void Xaxpy(const int n, const real alpha,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    __global real* ygm, const int y_offset, const int y_inc) {

  // Strides over the whole vector by the total number of global threads, so any thread count works
  #pragma unroll
  for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
    MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
  }
}

// =================================================================================================

// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// dividable by 'VW', 'WGS' and 'WPT'.
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void XaxpyFast(const int n, const real alpha,
                        const __global realV* restrict xgm,
                        __global realV* ygm) {

  // Each thread updates WPT vector-elements, spaced apart by the total global size
  #pragma unroll
  for (int w=0; w<WPT; ++w) {
    const int id = w*get_global_size(0) + get_global_id(0);
    ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
  }
}

// =================================================================================================

// End of the C++11 raw string literal
)";

// =================================================================================================
diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl
new file mode 100644
index 00000000..facaf5dc
--- /dev/null
+++ b/src/kernels/xgemm.opencl
@@ -0,0 +1,570 @@

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
//
// Matrices are accessed as follows:
// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
//
// Or as an image (assuming column-major)
//       K
//    o-------o
//    |       |
//  N | [B^T] |
//    |       |
//    o-------o
//        K               N
//    o-------o        o-----o
//  M |  [A]  |      M | [C] |
//    |       |        |     |
//    o-------o        o-----o
//
//
// =================================================================================================

// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(

// =================================================================================================

// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef MWG
  #define MWG 8      // Tile-size in dimension M (e.g. 64, 128)
#endif
#ifndef NWG
  #define NWG 8      // Tile-size in dimension N (e.g. 64, 128)
#endif
#ifndef KWG
  #define KWG 8      // Tile-size in dimension K (e.g. 8, 16)
#endif
#ifndef MDIMC
  #define MDIMC 8    // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
#endif
#ifndef NDIMC
  #define NDIMC 8    // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
#endif
#ifndef MDIMA
  #define MDIMA 8    // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
#endif
#ifndef NDIMB
  #define NDIMB 8    // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
#endif
#ifndef KWI
  #define KWI 1      // Unroll factor of the KWG loop (smaller or equal than KWG)
#endif
#ifndef VWM
  #define VWM 1      // Vector width of matrices A and C
#endif
#ifndef VWN
  #define VWN 1      // Vector width of matrix B
#endif
#ifndef STRM
  #define STRM 0     // Use strided access within a thread in the M-dimension (1) or not (0)
#endif
#ifndef STRN
  #define STRN 0     // Use strided access within a thread in the N-dimension (1) or not (0)
#endif
#ifndef SA
  #define SA 0       // Use local/shared memory to cache matrix A (1) or not (0)
#endif
#ifndef SB
  #define SB 0       // Use local/shared memory to cache matrix B (1) or not (0)
#endif

// Helper parameters derived from the tuning parameters above
#define MWI (MWG/MDIMC)               // Work per work-item (M-dimension)
#define NWI (NWG/NDIMC)               // Work per work-item (N-dimension)
#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
#define MWA (MWG/MDIMA)               // Amount of loads-per-thread for matrix A (M-dimension)
#define KWA (KWG/KDIMA)               // Amount of loads-per-thread for matrix A (K-dimension)
#define KWB (KWG/KDIMB)               // Amount of loads-per-thread for matrix B (K-dimension)
#define NWB (NWG/NDIMB)               // Amount of loads-per-thread for matrix B (N-dimension)

// Settings
#define USE_VECTOR_MAD 0  // Unroll (0) or don't (1) unroll the vector MAD manually

// =================================================================================================

// Vector data-type for matrices A and C, matching the configured width VWM
#if VWM == 1
  typedef real realM;
#elif VWM == 2
  typedef real2 realM;
#elif VWM == 4
  typedef real4 realM;
#elif VWM == 8
  typedef real8 realM;
#elif VWM == 16
  typedef real16 realM;
#endif

// Vector data-type for matrix B, matching the configured width VWN
#if VWN == 1
  typedef real realN;
#elif VWN == 2
  typedef real2 realN;
#elif VWN == 4
  typedef real4 realN;
#elif VWN == 8
  typedef real8 realN;
#elif VWN == 16
  typedef real16 realN;
#endif

// =================================================================================================

// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
#if SA == 1
inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
                           const int kSizeM, const int tid, const int kwg) {

  // Decomposes the combined thread ID according to the MDIMA x KDIMA re-shaped tile
  const int la0 = tid % MDIMA;
  const int la1 = tid / MDIMA;
  #pragma unroll
  for (int mia=0; mia<MWA/VWM; ++mia) {
    #pragma unroll
    for (int kia=0; kia<KWA; ++kia) {

      // Computes the M-index based on strided/non-strided access
      #if STRM == 0
        const int mg = mia + la0*(MWA/VWM);
      #elif STRM == 1
        const int mg = la0 + mia*MDIMA;
      #endif

      // Computes the indices for the global memory
      const int kg = kia + la1*KWA;
      const int idm = mg + get_group_id(0)*(MWG/VWM);
      const int idk = kg + kwg;

      // Loads the data from global memory (not transposed) into the local memory
      alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
    }
  }
}
#endif

// Same as above, but now for the B input matrix
#if SB == 1
inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
                           const int kSizeN, const int tid, const int kwg) {

  // Decomposes the combined thread ID according to the NDIMB x KDIMB re-shaped tile
  const int lb0 = tid % NDIMB;
  const int lb1 = tid / NDIMB;
  #pragma unroll
  for (int kib=0; kib<KWB; ++kib) {
    #pragma unroll
    for (int nib=0; nib<NWB/VWN; ++nib) {

      // Computes the N-index based on strided/non-strided access
      #if STRN == 0
        const int ng = nib + lb0*(NWB/VWN);
      #elif STRN == 1
        const int ng = lb0 + nib*NDIMB;
      #endif

      // Computes the indices for the global memory
      const int kg = kib + lb1*KWB;
      const int idn = ng + get_group_id(1)*(NWG/VWN);
      const int idk = kg + kwg;

      // Loads the data from global memory (transposed) into the local memory
      blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
    }
  }
}
#endif

// =================================================================================================

// Caches global off-chip memory directly into per-thread private memory (registers). This function
// is specific for caching the A input matrix.
#if SA == 0
inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
                             const int kSizeM, const int idk, const int kwg) {
  // NOTE(review): 'kwg' is unused in this body; it is kept to mirror the SA==1 code path
  #pragma unroll
  for (int mi=0; mi<MWI/VWM; ++mi) {

    // Computes the M-index based on strided/non-strided access
    #if STRM == 0
      const int mg = mi + get_local_id(0)*(MWI/VWM);
    #elif STRM == 1
      const int mg = get_local_id(0) + mi*MDIMC;
    #endif

    // Computes the index for the global memory
    const int idm = mg + get_group_id(0)*(MWG/VWM);

    // Loads the data from global memory (not transposed) and stores it into registers
    apm[mi] = agm[idk*(kSizeM/VWM) + idm];
  }
}
#endif

// Same as above, but now for the B input matrix
#if SB == 0
inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
                             const int kSizeN, const int idk) {
  #pragma unroll
  for (int ni=0; ni<NWI/VWN; ++ni) {

    // Computes the N-index based on strided/non-strided access
    #if STRN == 0
      const int ng = ni + get_local_id(1)*(NWI/VWN);
    #elif STRN == 1
      const int ng = get_local_id(1) + ni*NDIMC;
    #endif

    // Computes the index for the global memory
    const int idn = ng + get_group_id(1)*(NWG/VWN);

    // Loads the data from global memory (transposed) and stores it into registers
    bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
  }
}
#endif

// =================================================================================================

// Caches on-chip local memory into per-thread private memory (registers). This function is specific
// for caching the A input matrix.
#if SA == 1
inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
  #pragma unroll
  for (int mi=0; mi<MWI/VWM; ++mi) {
    #if STRM == 0
      const int mg = mi + get_local_id(0)*(MWI/VWM);
    #elif STRM == 1
      const int mg = get_local_id(0) + mi*MDIMC;
    #endif
    apm[mi] = alm[kg*(MWG/VWM) + mg];
  }
}
#endif

// Same as above, but now for the B input matrix
#if SB == 1
inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
  #pragma unroll
  for (int ni=0; ni<NWI/VWN; ++ni) {
    #if STRN == 0
      const int ng = ni + get_local_id(1)*(NWI/VWN);
    #elif STRN == 1
      const int ng = get_local_id(1) + ni*NDIMC;
    #endif
    bpm[ni] = blm[kg*(NWG/VWN) + ng];
  }
}
#endif

// =================================================================================================

// Merges the per-thread results in Cpm with the global array in Cgm. This also performs the
// multiplication with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
                         const real alpha, const real beta) {
  #pragma unroll
  for (int ni=0; ni<NWI; ++ni) {
    #pragma unroll
    for (int mi=0; mi<MWI/VWM; ++mi) {

      // Computes the global indices of this thread's output element
      #if STRM == 0
        const int mg = mi + get_local_id(0)*(MWI/VWM);
      #elif STRM == 1
        const int mg = get_local_id(0) + mi*MDIMC;
      #endif
      #if STRN == 0
        const int ng = ni + get_local_id(1)*NWI;
      #elif STRN == 1
        const int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
      #endif
      const int idm = mg + get_group_id(0)*(MWG/VWM);
      const int idn = ng + get_group_id(1)*NWG;
      const int index = idn*(kSizeM/VWM) + idm;

      // Applies the AXPBY operation per vector component
      #if VWM == 1
        AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cgm[index]);
      #elif VWM == 2
        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cgm[index].x);
        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cgm[index].y);
      #elif VWM == 4
        AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cgm[index].x);
        AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cgm[index].y);
        AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cgm[index].z);
        AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cgm[index].w);
      #elif VWM == 8
        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cgm[index].s0);
        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cgm[index].s1);
        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cgm[index].s2);
        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cgm[index].s3);
        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cgm[index].s4);
        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cgm[index].s5);
        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cgm[index].s6);
        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cgm[index].s7);
      #elif VWM == 16
        AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cgm[index].s0);
        AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cgm[index].s1);
        AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cgm[index].s2);
        AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cgm[index].s3);
        AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cgm[index].s4);
        AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cgm[index].s5);
        AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cgm[index].s6);
        AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cgm[index].s7);
        AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cgm[index].s8);
        AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cgm[index].s9);
        AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cgm[index].sA);
        AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cgm[index].sB);
        AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cgm[index].sC);
        AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cgm[index].sD);
        AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cgm[index].sE);
        AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cgm[index].sF);
      #endif
    }
  }
}

// =================================================================================================

// The vectorised multiply-add function: cvec += avec * bval. The per-component expansion via the
// MultiplyAdd routine is used unless the USE_VECTOR_MAD shortcut is enabled.
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
  #if USE_VECTOR_MAD == 1
    cvec += avec * bval;
  #else
    #if VWM == 1
      MultiplyAdd(cvec, avec, bval);
    #elif VWM == 2
      MultiplyAdd(cvec.x , avec.x, bval);
      MultiplyAdd(cvec.y , avec.y, bval);
    #elif VWM == 4
      MultiplyAdd(cvec.x , avec.x, bval);
      MultiplyAdd(cvec.y , avec.y, bval);
      MultiplyAdd(cvec.z , avec.z, bval);
      MultiplyAdd(cvec.w , avec.w, bval);
    #elif VWM == 8
      MultiplyAdd(cvec.s0, avec.s0, bval);
      MultiplyAdd(cvec.s1, avec.s1, bval);
      MultiplyAdd(cvec.s2, avec.s2, bval);
      MultiplyAdd(cvec.s3, avec.s3, bval);
      MultiplyAdd(cvec.s4, avec.s4, bval);
      MultiplyAdd(cvec.s5, avec.s5, bval);
      MultiplyAdd(cvec.s6, avec.s6, bval);
      MultiplyAdd(cvec.s7, avec.s7, bval);
    #elif VWM == 16
      MultiplyAdd(cvec.s0, avec.s0, bval);
      MultiplyAdd(cvec.s1, avec.s1, bval);
      MultiplyAdd(cvec.s2, avec.s2, bval);
      MultiplyAdd(cvec.s3, avec.s3, bval);
      MultiplyAdd(cvec.s4, avec.s4, bval);
      MultiplyAdd(cvec.s5, avec.s5, bval);
      MultiplyAdd(cvec.s6, avec.s6, bval);
      MultiplyAdd(cvec.s7, avec.s7, bval);
      MultiplyAdd(cvec.s8, avec.s8, bval);
      MultiplyAdd(cvec.s9, avec.s9, bval);
      MultiplyAdd(cvec.sA, avec.sA, bval);
      MultiplyAdd(cvec.sB, avec.sB, bval);
      MultiplyAdd(cvec.sC, avec.sC, bval);
      MultiplyAdd(cvec.sD, avec.sD, bval);
      MultiplyAdd(cvec.sE, avec.sE, bval);
      MultiplyAdd(cvec.sF, avec.sF, bval);
    #endif
  #endif
  return cvec;
}

// Performs the actual computation: Cpm += Apm * Bpm
inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) {
  #pragma unroll
  for (int ni=0; ni<NWI/VWN; ++ni) {
    #pragma unroll
    for (int mi=0; mi<MWI/VWM; ++mi) {
      #if VWN == 1
        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni]);
      #elif VWN == 2
        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
      #elif VWN == 4
        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
        cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].z);
        cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].w);
      #elif VWN == 8
        cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].s0);
        cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].s1);
        cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].s2);
        cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].s3);
        cpm[ni*VWN + 4][mi] = MultiplyAddVector(cpm[ni*VWN + 4][mi], apm[mi], bpm[ni].s4);
        cpm[ni*VWN + 5][mi] = MultiplyAddVector(cpm[ni*VWN + 5][mi], apm[mi], bpm[ni].s5);
        cpm[ni*VWN + 6][mi] = MultiplyAddVector(cpm[ni*VWN + 6][mi], apm[mi], bpm[ni].s6);
        cpm[ni*VWN + 7][mi] = MultiplyAddVector(cpm[ni*VWN + 7][mi], apm[mi], bpm[ni].s7);
      #elif VWN == 16
        cpm[ni*VWN + 0 ][mi] = MultiplyAddVector(cpm[ni*VWN + 0 ][mi], apm[mi], bpm[ni].s0);
        cpm[ni*VWN + 1 ][mi] = MultiplyAddVector(cpm[ni*VWN + 1 ][mi], apm[mi], bpm[ni].s1);
        cpm[ni*VWN + 2 ][mi] = MultiplyAddVector(cpm[ni*VWN + 2 ][mi], apm[mi], bpm[ni].s2);
        cpm[ni*VWN + 3 ][mi] = MultiplyAddVector(cpm[ni*VWN + 3 ][mi], apm[mi], bpm[ni].s3);
        cpm[ni*VWN + 4 ][mi] = MultiplyAddVector(cpm[ni*VWN + 4 ][mi], apm[mi], bpm[ni].s4);
        cpm[ni*VWN + 5 ][mi] = MultiplyAddVector(cpm[ni*VWN + 5 ][mi], apm[mi], bpm[ni].s5);
        cpm[ni*VWN + 6 ][mi] = MultiplyAddVector(cpm[ni*VWN + 6 ][mi], apm[mi], bpm[ni].s6);
        cpm[ni*VWN + 7 ][mi] = MultiplyAddVector(cpm[ni*VWN + 7 ][mi], apm[mi], bpm[ni].s7);
        cpm[ni*VWN + 8 ][mi] = MultiplyAddVector(cpm[ni*VWN + 8 ][mi], apm[mi], bpm[ni].s8);
        cpm[ni*VWN + 9 ][mi] = MultiplyAddVector(cpm[ni*VWN + 9 ][mi], apm[mi], bpm[ni].s9);
        cpm[ni*VWN + 10][mi] = MultiplyAddVector(cpm[ni*VWN + 10][mi], apm[mi], bpm[ni].sA);
        cpm[ni*VWN + 11][mi] = MultiplyAddVector(cpm[ni*VWN + 11][mi], apm[mi], bpm[ni].sB);
        cpm[ni*VWN + 12][mi] = MultiplyAddVector(cpm[ni*VWN + 12][mi], apm[mi], bpm[ni].sC);
        cpm[ni*VWN + 13][mi] = MultiplyAddVector(cpm[ni*VWN + 13][mi], apm[mi], bpm[ni].sD);
        cpm[ni*VWN + 14][mi] = MultiplyAddVector(cpm[ni*VWN + 14][mi], apm[mi], bpm[ni].sE);
        cpm[ni*VWN + 15][mi] = MultiplyAddVector(cpm[ni*VWN + 15][mi], apm[mi], bpm[ni].sF);
      #endif
    }
  }
}

// =================================================================================================

// Main entry of the kernel. This function contains the basic skeleton, the functionality is
// provided by the inlined functions above.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
                    const real alpha, const real beta,
                    const __global realM* restrict agm,
                    const __global realN* restrict bgm,
                    __global realM* cgm) {

  // Combined thread identifier
  // NOTE(review): 'volatile' is kept as-is; presumably a compiler workaround — confirm before removing
  #if SA == 1 || SB == 1
    volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
  #endif

  // Allocates workgroup-private memory (local memory)
  #if SA == 1
    __local realM alm[KWG * MWG/VWM];
  #endif
  #if SB == 1
    __local realN blm[KWG * NWG/VWN];
  #endif

  // Allocates workitem-private memory (registers)
  realM apm[MWI/VWM];
  realN bpm[NWI/VWN];
  realM cpm[NWI][MWI/VWM];

  // Initializes the accumulation registers to zero
  #pragma unroll
  for (int mi=0; mi<MWI/VWM; ++mi) {
    #pragma unroll
    for (int ni=0; ni<NWI; ++ni) {
      #if VWM == 1
        SetToZero(cpm[ni][mi]);
      #elif VWM == 2
        SetToZero(cpm[ni][mi].x);
        SetToZero(cpm[ni][mi].y);
      #elif VWM == 4
        SetToZero(cpm[ni][mi].x);
        SetToZero(cpm[ni][mi].y);
        SetToZero(cpm[ni][mi].z);
        SetToZero(cpm[ni][mi].w);
      #elif VWM == 8
        SetToZero(cpm[ni][mi].s0);
        SetToZero(cpm[ni][mi].s1);
        SetToZero(cpm[ni][mi].s2);
        SetToZero(cpm[ni][mi].s3);
        SetToZero(cpm[ni][mi].s4);
        SetToZero(cpm[ni][mi].s5);
        SetToZero(cpm[ni][mi].s6);
        SetToZero(cpm[ni][mi].s7);
      #elif VWM == 16
        SetToZero(cpm[ni][mi].s0);
        SetToZero(cpm[ni][mi].s1);
        SetToZero(cpm[ni][mi].s2);
        SetToZero(cpm[ni][mi].s3);
        SetToZero(cpm[ni][mi].s4);
        SetToZero(cpm[ni][mi].s5);
        SetToZero(cpm[ni][mi].s6);
        SetToZero(cpm[ni][mi].s7);
        SetToZero(cpm[ni][mi].s8);
        SetToZero(cpm[ni][mi].s9);
        SetToZero(cpm[ni][mi].sA);
        SetToZero(cpm[ni][mi].sB);
        SetToZero(cpm[ni][mi].sC);
        SetToZero(cpm[ni][mi].sD);
        SetToZero(cpm[ni][mi].sE);
        SetToZero(cpm[ni][mi].sF);
      #endif
    }
  }

  // Loops over all workgroup tiles
  for (int kwg=0; kwg<kSizeK; kwg+=KWG) {

    // Loads data: off-chip --> local (matrix A)
    #if SA == 1
      GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
    #endif
    // Loads data: off-chip --> local (matrix B)
    #if SB == 1
      GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
    #endif

    // Synchronizes all threads in a workgroup
    #if SA == 1 || SB == 1
      barrier(CLK_LOCAL_MEM_FENCE);
    #endif

    // Loops over all workitem tiles, unrolled by a factor KWI
    for (int pwi=0; pwi<KWG; pwi+=KWI) {
      #pragma unroll
      for (int pit=0; pit<KWI; ++pit) {
        #if SA == 0 || SB == 0
          const int idk = kwg + pwi + pit;
        #endif
        #if SA == 1 || SB == 1
          const int kg = pwi + pit;
        #endif

        // Loads data: local --> private (matrix A)
        #if SA == 1
          LocalToPrivateA(alm, apm, kg);
        // Loads data: off-chip --> private (matrix A)
        #else
          GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
        #endif

        // Loads data: local --> private (matrix B)
        #if SB == 1
          LocalToPrivateB(blm, bpm, kg);
        // Loads data: off-chip --> private (matrix B)
        #else
          GlobalToPrivateB(bgm, bpm, kSizeN, idk);
        #endif

        // Performs the accumulation (Cpm += Apm * Bpm)
        MultiplyAccumulate(cpm, apm, bpm);
      }
    }

    // Synchronizes all threads in a workgroup
    #if SA == 1 || SB == 1
      barrier(CLK_LOCAL_MEM_FENCE);
    #endif
  }

  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
  StoreResults(cgm, cpm, kSizeM, alpha, beta);
}


// =================================================================================================

// End of the C++11 raw string literal
)";

// =================================================================================================
diff --git a/src/routine.cc b/src/routine.cc
new file mode 100644
index 00000000..32face4a
--- /dev/null
+++ b/src/routine.cc
@@ -0,0 +1,326 @@

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Routine base class (see the header for information about the class).
//
// =================================================================================================

#include "internal/routine.h"

#include "internal/utilities.h"

namespace clblast {
// =================================================================================================

// The cache of compiled OpenCL programs. Note that this is a process-wide static: it is shared by
// all Routine instances and lives for the duration of the program.
std::vector<Routine::ProgramCache> Routine::program_cache_;

// Constructor: not much here, because no status codes can be returned. Only caches device
// properties and builds the parameter database (db_); the actual OpenCL program compilation is
// deferred to SetUp() so that it can report errors through a StatusCode.
// NOTE(review): members are initialized in declaration order, not init-list order — presumably the
// declaration order in the header matches this list; confirm db_ is declared after the device
// fields it indirectly depends on.
Routine::Routine(CommandQueue &queue, Event &event,
                 const std::vector<std::string> &routines, const Precision precision):
    precision_(precision),
    queue_(queue),
    event_(event),
    context_(queue_.GetContext()),
    device_(queue_.GetDevice()),
    device_name_(device_.Name()),
    max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
    max_work_item_sizes_(device_.MaxWorkItemSizes()),
    max_work_group_size_(device_.MaxWorkGroupSize()),
    db_(queue_, routines, precision_),
    routines_(routines) {
}

// =================================================================================================

// Separate set-up function to allow for status codes to be returned. Compiles the OpenCL program
// for this routine (device parameters + common header + routine source) unless an equivalent
// program is already present in the static program cache.
// NOTE(review): access to the shared program_cache_ is not synchronized — confirm that routines
// are only set up from a single thread.
StatusCode Routine::SetUp(const std::string &routine_source) {

  // Queries the cache to see whether or not the compiled kernel is already there. If not, it will
  // be built and added to the cache.
  if (!ProgramIsInCache()) {

    // Inspects whether or not cl_khr_fp64 is supported in case of double precision
    auto extensions = device_.Extensions();
    if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
      if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
        return StatusCode::kNoDoublePrecision;
      }
    }

    // As above, but for cl_khr_fp16 (half precision)
    if (precision_ == Precision::kHalf) {
      if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
        return StatusCode::kNoHalfPrecision;
      }
    }

    // Loads the common header (typedefs and defines and such). Like the kernel sources, this file
    // is a C++11 raw string literal pulled in through the preprocessor.
    std::string common_header =
      #include "kernels/common.opencl"

    // Collects the parameters for this device in the form of defines, and adds the precision
    auto defines = db_.GetDefines();
    defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
    auto source_string = defines + common_header + routine_source;

    // Compiles the kernel
    try {
      auto program = Program(context_, source_string);
      auto options = std::string{};
      auto status = program.Build(device_, options);

      // Checks for compiler crashes/errors/warnings
      if (status == CL_BUILD_PROGRAM_FAILURE) {
        auto message = program.GetBuildInfo(device_);
        fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
        return StatusCode::kBuildProgramFailure;
      }
      if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }

      // Store the compiled program in the cache
      program_cache_.push_back({program, device_name_, precision_, routines_});
    } catch (...) { return StatusCode::kBuildProgramFailure; }
  }

  // No errors, normal termination of this function
  return StatusCode::kSuccess;
}

// =================================================================================================

// Enqueues a kernel, waits for completion, and checks for errors. Validates the requested local
// (workgroup) configuration against the device limits before launching, and blocks until the
// kernel has finished running.
// NOTE(review): the loops below assume global.size() == local.size(); confirm all callers pass
// equally-sized vectors.
StatusCode Routine::RunKernel(const Kernel &kernel, std::vector<size_t> &global,
                              const std::vector<size_t> &local) {

  // Tests for validity of the local thread sizes
  if (local.size() > max_work_item_dimensions_) {
    return StatusCode::kInvalidLocalNumDimensions;
  }
  for (auto i=size_t{0}; i<local.size(); ++i) {
    if (local[i] > max_work_item_sizes_[i]) { return StatusCode::kInvalidLocalThreadsDim; }
  }
  auto local_size = size_t{1};
  for (auto &item: local) { local_size *= item; }
  if (local_size > max_work_group_size_) { return StatusCode::kInvalidLocalThreadsTotal; }

  // Make sure the global thread sizes are at least equal to the local sizes
  // NOTE(review): pre-2.0 OpenCL requires each global size to be a whole multiple of the local
  // size; clamping to local[i] only fixes the smaller-than case — confirm callers always supply
  // multiples.
  for (auto i=size_t{0}; i<global.size(); ++i) {
    if (global[i] < local[i]) { global[i] = local[i]; }
  }

  // Tests for local memory usage
  auto local_mem_usage = kernel.LocalMemUsage(device_);
  if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }

  // Launches the kernel (and checks for launch errors)
  auto status = queue_.EnqueueKernel(kernel, global, local, event_);
  if (status != CL_SUCCESS) { return StatusCode::kKernelLaunchError; }

  // Waits for completion of the kernel
  status = event_.Wait();
  if (status != CL_SUCCESS) { return StatusCode::kKernelRunError; }

  // No errors, normal termination of this function
  return StatusCode::kSuccess;
}

// =================================================================================================

// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
// sufficient buffer size.
+StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld, const size_t data_size) { + if (ld < one) { return StatusCode::kInvalidLeadDimA; } + try { + auto required_size = (ld*two + offset)*data_size; + auto buffer_size = buffer.GetSize(); + if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; } + } catch (...) { return StatusCode::kInvalidMatrixA; } + return StatusCode::kSuccess; +} + +// Tests matrix B for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a +// sufficient buffer size. +StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld, const size_t data_size) { + if (ld < one) { return StatusCode::kInvalidLeadDimB; } + try { + auto required_size = (ld*two + offset)*data_size; + auto buffer_size = buffer.GetSize(); + if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; } + } catch (...) { return StatusCode::kInvalidMatrixB; } + return StatusCode::kSuccess; +} + +// Tests matrix C for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a +// sufficient buffer size. +StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, + const size_t offset, const size_t ld, const size_t data_size) { + if (ld < one) { return StatusCode::kInvalidLeadDimC; } + try { + auto required_size = (ld*two + offset)*data_size; + auto buffer_size = buffer.GetSize(); + if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; } + } catch (...) { return StatusCode::kInvalidMatrixC; } + return StatusCode::kSuccess; +} + +// ================================================================================================= + +// Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a +// sufficient buffer size. 
StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, const size_t offset,
                                const size_t inc, const size_t data_size) {
  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
  try {
    // NOTE(review): the strict BLAS minimum would be ((n-1)*inc + 1 + offset) elements; the
    // formula below over-estimates slightly and is therefore conservative — confirm intentional.
    auto required_size = (n*inc + offset)*data_size;
    auto buffer_size = buffer.GetSize();
    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
  } catch (...) { return StatusCode::kInvalidVectorX; }
  return StatusCode::kSuccess;
}

// Tests vector Y for validity: checks for a valid increment, a valid OpenCL buffer, and for a
// sufficient buffer size.
StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
                                const size_t inc, const size_t data_size) {
  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
  try {
    // Same conservative size formula as for vector X above.
    auto required_size = (n*inc + offset)*data_size;
    auto buffer_size = buffer.GetSize();
    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
  } catch (...) { return StatusCode::kInvalidVectorY; }
  return StatusCode::kSuccess;
}

// =================================================================================================

// Copies a matrix and pads it with zeros. Selects between four kernel families based on whether a
// transpose is requested and whether the (faster, restriction-laden) direct kernels can be used;
// falls back to the general pad/unpad kernels otherwise. Launch geometry comes from the
// per-device tuning database (db_).
StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
                                           const size_t src_ld, const size_t src_offset,
                                           const Buffer &src,
                                           const size_t dest_one, const size_t dest_two,
                                           const size_t dest_ld, const size_t dest_offset,
                                           const Buffer &dest,
                                           const bool do_transpose, const bool pad,
                                           const Program &program) {

  // Determines whether or not the fast-version could potentially be used: fast kernels take no
  // offsets and require identical source/destination shapes.
  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) &&
                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);

  // Determines the right kernel. The fast kernels additionally require the sizes to be multiples
  // of the tuned vector-width / workgroup-tile parameters; when these do not hold,
  // 'use_fast_kernel' is cleared and the general (un)pad kernel is chosen instead.
  auto kernel_name = std::string{};
  if (do_transpose) {
    if (use_fast_kernel &&
        IsMultiple(src_ld, db_["TRA_WPT"]) &&
        IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) &&
        IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"])) {
      kernel_name = "TransposeMatrix";
    }
    else {
      use_fast_kernel = false;
      kernel_name = (pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
    }
  }
  else {
    if (use_fast_kernel &&
        IsMultiple(src_ld, db_["COPY_VW"]) &&
        IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) &&
        IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"])) {
      kernel_name = "CopyMatrix";
    }
    else {
      use_fast_kernel = false;
      kernel_name = (pad) ? "PadMatrix" : "UnPadMatrix";
    }
  }

  // Retrieves the kernel from the compiled binary
  try {
    auto kernel = Kernel(program, kernel_name);

    // Sets the kernel arguments. The fast kernels take only the leading dimension plus the two
    // buffers; the general kernels take the full source and destination geometry.
    if (use_fast_kernel) {
      kernel.SetArgument(0, static_cast<int>(src_ld));
      kernel.SetArgument(1, src());
      kernel.SetArgument(2, dest());
    }
    else {
      kernel.SetArgument(0, static_cast<int>(src_one));
      kernel.SetArgument(1, static_cast<int>(src_two));
      kernel.SetArgument(2, static_cast<int>(src_ld));
      kernel.SetArgument(3, static_cast<int>(src_offset));
      kernel.SetArgument(4, src());
      kernel.SetArgument(5, static_cast<int>(dest_one));
      kernel.SetArgument(6, static_cast<int>(dest_two));
      kernel.SetArgument(7, static_cast<int>(dest_ld));
      kernel.SetArgument(8, static_cast<int>(dest_offset));
      kernel.SetArgument(9, dest());
    }

    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
    // parameters in the database.
    auto status = StatusCode::kSuccess;
    if (do_transpose) {
      if (use_fast_kernel) {
        auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"],
                                          dest_two / db_["TRA_WPT"]};
        auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
        status = RunKernel(kernel, global, local);
      }
      else {
        auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
                                          Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
        auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
        status = RunKernel(kernel, global, local);
      }
    }
    else {
      if (use_fast_kernel) {
        auto global = std::vector<size_t>{dest_one / db_["COPY_VW"],
                                          dest_two / db_["COPY_WPT"]};
        auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
        status = RunKernel(kernel, global, local);
      }
      else {
        auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
                                          Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
        auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
        status = RunKernel(kernel, global, local);
      }
    }
    return status;
  } catch (...) { return StatusCode::kInvalidKernel; }
}

// =================================================================================================

// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
// otherwise.
Program Routine::GetProgramFromCache() const {
  // Linear scan over the process-wide cache; matches on device name, precision, and routine list.
  // NOTE(review): the matched Program is returned by value — presumably Program is a cheap
  // reference-counted handle; confirm.
  for (auto &cached_program: program_cache_) {
    if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
      return cached_program.program;
    }
  }
  throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
}

// Queries the cache to see whether or not the compiled kernel is already there. Uses the same
// linear-scan match as GetProgramFromCache above, but only reports presence.
bool Routine::ProgramIsInCache() const {
  for (auto &cached_program: program_cache_) {
    if (cached_program.MatchInCache(device_name_, precision_, routines_)) { return true; }
  }
  return false;
}

// =================================================================================================
} // namespace clblast
diff --git a/src/routines/xaxpy.cc b/src/routines/xaxpy.cc
new file mode 100644
index 00000000..309ae3ce
--- /dev/null
+++ b/src/routines/xaxpy.cc
@@ -0,0 +1,115 @@

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xaxpy class (see the header for information about the class).
//
// =================================================================================================

#include "internal/routines/xaxpy.h"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;

// =================================================================================================

// Constructor: forwards to base class constructor. Registers the single "Xaxpy" kernel family
// with the Routine base class.
template <typename T>
Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
    Routine(queue, event, {"Xaxpy"}, precision_) {
}

// =================================================================================================

// The main routine: computes y := alpha*x + y. Validates the vectors, selects between a fast
// (contiguous, zero-offset, size-aligned) and a general kernel, launches it, and blocks until the
// result is available (queue_.Finish()).
template <typename T>
StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Tests the vectors for validity
  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Determines whether or not the fast-version can be used: requires zero offsets, unit
  // increments, and n divisible by the tuned workgroup/work-per-thread/vector-width product.
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
                         (y_offset == 0) && (y_inc == 1) &&
                         IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);

  // If possible, run the fast-version of the kernel
  auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";

  // Retrieves the Xaxpy kernel from the compiled binary
  try {
    auto program = GetProgramFromCache();
    auto kernel = Kernel(program, kernel_name);

    // Sets the kernel arguments. The fast kernel omits the offsets/increments it does not need.
    if (use_fast_kernel) {
      kernel.SetArgument(0, static_cast<int>(n));
      kernel.SetArgument(1, alpha);
      kernel.SetArgument(2, x_buffer());
      kernel.SetArgument(3, y_buffer());
    }
    else {
      kernel.SetArgument(0, static_cast<int>(n));
      kernel.SetArgument(1, alpha);
      kernel.SetArgument(2, x_buffer());
      kernel.SetArgument(3, static_cast<int>(x_offset));
      kernel.SetArgument(4, static_cast<int>(x_inc));
      kernel.SetArgument(5, y_buffer());
      kernel.SetArgument(6, static_cast<int>(y_offset));
      kernel.SetArgument(7, static_cast<int>(y_inc));
    }

    // Launches the kernel
    if (use_fast_kernel) {
      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
      auto local = std::vector<size_t>{db_["WGS"]};
      status = RunKernel(kernel, global, local);
    }
    else {
      auto n_ceiled = Ceil(n, db_["WGS"]);
      auto global = std::vector<size_t>{CeilDiv(n_ceiled, db_["WPT"])};
      auto local = std::vector<size_t>{db_["WGS"]};
      status = RunKernel(kernel, global, local);
    }
    if (ErrorIn(status)) { return status; }

    // Waits for all kernels to finish
    queue_.Finish();

    // Succesfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
}

// =================================================================================================

// Compiles the templated class
template class Xaxpy<float>;
template class Xaxpy<double>;
template class Xaxpy<float2>;
template class Xaxpy<double2>;

// =================================================================================================
} // namespace clblast
diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc
new file mode 100644
index 00000000..16bbc154
--- /dev/null
+++ b/src/routines/xgemm.cc
@@ -0,0 +1,168 @@

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgemm class (see the header for information about the class).
//
// =================================================================================================

#include "internal/routines/xgemm.h"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;

// =================================================================================================

// Constructor: forwards to base class constructor. Registers all kernel families used by GEMM:
// the pre/post-processing copy/pad/transpose kernels plus the main Xgemm kernel.
template <typename T>
Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
}

// =================================================================================================

// The main routine: computes C := alpha*op(A)*op(B) + beta*C. Inputs are first padded/transposed
// into temporary device buffers sized to tuned multiples (MWG/NWG/KWG), the main kernel runs on
// those, and the result is un-padded back into the user's C buffer.
template <typename T>
StatusCode Xgemm<T>::DoGemm(const Layout layout,
                            const Transpose a_transpose, const Transpose b_transpose,
                            const size_t m, const size_t n, const size_t k,
                            const T alpha,
                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
                            const T beta,
                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
  if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }

  // Computes whether or not the matrices are transposed in memory. This is based on their layout
  // (row or column-major) and whether or not they are requested to be pre-transposed. Note
  // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of
  // col-major) to be transformed, so transposing requirements are not the same as whether or not
  // the matrix is actually transposed in memory.
  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
  auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
                   (layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
  auto c_rotated = (layout == Layout::kRowMajor);
  auto a_do_transpose = a_rotated;
  auto b_do_transpose = !b_rotated;
  auto c_do_transpose = c_rotated;

  // Computes the first and second dimensions of the 3 matrices taking into account whether the
  // matrices are rotated or not
  auto a_one = (a_rotated) ? k : m;
  auto a_two = (a_rotated) ? m : k;
  auto b_one = (b_rotated) ? n : k;
  auto b_two = (b_rotated) ? k : n;
  auto c_one = (c_rotated) ? n : m;
  auto c_two = (c_rotated) ? m : n;

  // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
  // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the
  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
  // space. Also tests that the leading dimensions of:
  //    matrix A cannot be less than K when rotated, or less than M when not-rotated
  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
  //    matrix C cannot be less than N when rotated, or less than M when not-rotated
  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Calculates the ceiled versions of m, n, and k
  auto m_ceiled = Ceil(m, db_["MWG"]);
  auto n_ceiled = Ceil(n, db_["NWG"]);
  auto k_ceiled = Ceil(k, db_["KWG"]);

  // Allocates space on the device for padded and/or transposed input and output matrices.
  try {
    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));

    // Loads the program from the database
    auto program = GetProgramFromCache();

    // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
    // them up until they reach a certain multiple of size (kernel parameter dependent).
    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
                                    m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
                                    a_do_transpose, true, program);
    if (ErrorIn(status)) { return status; }
    status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
                                    b_do_transpose, true, program);
    if (ErrorIn(status)) { return status; }

    // Only necessary for matrix C if it used both as input and output
    // NOTE(review): relies on operator!= for T (incl. float2/double2) — confirm it is defined for
    // the complex types. When beta == 0, temp_c is left unfilled; presumably the Xgemm kernel
    // ignores C input in that case — confirm.
    if (beta != static_cast<T>(0)) {
      status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
                                      m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
                                      c_do_transpose, true, program);
      if (ErrorIn(status)) { return status; }
    }

    // Retrieves the Xgemm kernel from the compiled binary
    try {
      auto kernel = Kernel(program, "Xgemm");

      // Sets the kernel arguments
      kernel.SetArgument(0, static_cast<int>(m_ceiled));
      kernel.SetArgument(1, static_cast<int>(n_ceiled));
      kernel.SetArgument(2, static_cast<int>(k_ceiled));
      kernel.SetArgument(3, alpha);
      kernel.SetArgument(4, beta);
      kernel.SetArgument(5, temp_a());
      kernel.SetArgument(6, temp_b());
      kernel.SetArgument(7, temp_c());

      // Computes the global and local thread sizes
      auto global = std::vector<size_t>{
        (m_ceiled * db_["MDIMC"]) / db_["MWG"],
        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
      };
      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};

      // Launches the kernel
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }

      // Runs the post-processing kernel: un-pads temp_c back into the user's C buffer.
      status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
                                      c_one, c_two, c_ld, c_offset, c_buffer,
                                      c_do_transpose, false, program);
      if (ErrorIn(status)) { return status; }

      // Successfully finished the computation
      return StatusCode::kSuccess;
    } catch (...) { return StatusCode::kInvalidKernel; }
  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
}

// =================================================================================================

// Compiles the templated class
template class Xgemm<float>;
template class Xgemm<double>;
template class Xgemm<float2>;
template class Xgemm<double2>;

// =================================================================================================
} // namespace clblast
diff --git a/src/routines/xsymm.cc b/src/routines/xsymm.cc
new file mode 100644
index 00000000..aa43593d
--- /dev/null
+++ b/src/routines/xsymm.cc
@@ -0,0 +1,132 @@

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsymm class (see the header for information about the class).
//
// =================================================================================================

#include "internal/routines/xsymm.h"

#include <string>
#include <vector>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor. Xsymm derives from Xgemm and reuses its
// kernels; the symmetric input is first expanded to a square general matrix.
template <typename T>
Xsymm<T>::Xsymm(CommandQueue &queue, Event &event):
    Xgemm<T>(queue, event) {
}

// =================================================================================================

// The main routine: computes C := alpha*A*B + beta*C (side = left) or C := alpha*B*A + beta*C
// (side = right) with A a symmetric matrix of which only one triangle is stored. The stored
// triangle is first mirrored into a temporary square k*k matrix, after which the regular Xgemm
// implementation is invoked.
template <typename T>
StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
                            const T beta,
                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }

  // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
  // left) or B (on the right) in the Xgemm routine.
  size_t k = (side == Side::kLeft) ? m : n;

  // Checks for validity of the squared A matrix
  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
  // default) and on whether we are dealing with an upper or lower triangle of the symmetrix matrix
  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
  auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";

  // Temporary buffer for a copy of the symmetric matrix
  try {
    auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));

    // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
    // routine afterwards
    try {
      auto program = GetProgramFromCache();
      auto kernel = Kernel(program, kernel_name);

      // Sets the arguments for the symmetric-to-squared kernel
      kernel.SetArgument(0, static_cast<int>(k));
      kernel.SetArgument(1, static_cast<int>(a_ld));
      kernel.SetArgument(2, static_cast<int>(a_offset));
      kernel.SetArgument(3, a_buffer());
      kernel.SetArgument(4, static_cast<int>(k));
      kernel.SetArgument(5, static_cast<int>(k));
      kernel.SetArgument(6, static_cast<int>(0));
      kernel.SetArgument(7, temp_symm());

      // Uses the common padding kernel's thread configuration. This is allowed, since the
      // symmetry-to-squared kernel uses the same parameters.
      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
      status = RunKernel(kernel, global, local);
      if (ErrorIn(status)) { return status; }

      // Runs the regular Xgemm code with either "C := AB+C" or ...
      if (side == Side::kLeft) {
        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
                        m, n, k,
                        alpha,
                        temp_symm, 0, k,
                        b_buffer, b_offset, b_ld,
                        beta,
                        c_buffer, c_offset, c_ld);
      }

      // ... with "C := BA+C". Note that A and B are now reversed.
      else {
        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
                        m, n, k,
                        alpha,
                        b_buffer, b_offset, b_ld,
                        temp_symm, 0, k,
                        beta,
                        c_buffer, c_offset, c_ld);

        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
        // (no default case on purpose: every other status passes through unchanged).
        switch(status) {
          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
        }
      }

      // Return the status of the Xgemm routine
      return status;
    } catch (...) { return StatusCode::kInvalidKernel; }
  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
}

// =================================================================================================

// Compiles the templated class
template class Xsymm<float>;
template class Xsymm<double>;
template class Xsymm<float2>;
template class Xsymm<double2>;

// =================================================================================================
} // namespace clblast
diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc
new file mode 100644
index 00000000..da223bf0
--- /dev/null
+++ b/src/tuning/copy.cc
@@ -0,0 +1,83 @@

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
+// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements an auto-tuner to tune the copy OpenCL kernels. It uses CLTune. +// +// ================================================================================================= + +#include <string> +#include <vector> +#include <stdexcept> + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// The copy auto-tuner +template <typename T> +void CopyTune(const Arguments<T> &args, + const std::vector<T> &a_mat, std::vector<T> &b_mat, + cltune::Tuner &tuner) { + + // This points to the CopyMatrix kernel as found in the CLBlast library. This is just one example + // of a copy kernel. However, all copy-kernels use the same tuning parameters, so one has to be + // chosen as a representative. + std::string common_source = + #include "../src/kernels/common.opencl" + std::string kernel_source = + #include "../src/kernels/copy.opencl" + auto sources = common_source + kernel_source; + auto id = tuner.AddKernelFromString(sources, "CopyMatrix", {args.m, args.n}, {1, 1}); + tuner.SetReferenceFromString(sources, "CopyMatrix", {args.m, args.n}, {8, 8}); + + // Sets the tunable parameters and their possible values + tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32}); + tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32}); + tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8}); + tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8}); + + // Tests for a specific precision + tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); + tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + + // Modifies the thread-sizes (both global and local) based on the parameters + tuner.MulLocalSize(id, {"COPY_DIMX", "COPY_DIMY"}); + tuner.DivGlobalSize(id, {"COPY_VW", "COPY_WPT"}); + + // Sets the function's arguments + 
tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentOutput(b_mat); +} + +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. +void TunerCopy(int argc, char *argv[]) { + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: TunerAB<float>(argc, argv, CopyTune<float>); break; + case Precision::kDouble: TunerAB<double>(argc, argv, CopyTune<double>); break; + case Precision::kComplexSingle: TunerAB<float2>(argc, argv, CopyTune<float2>); break; + case Precision::kComplexDouble: TunerAB<double2>(argc, argv, CopyTune<double2>); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::TunerCopy(argc, argv); + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc new file mode 100644 index 00000000..93312df2 --- /dev/null +++ b/src/tuning/pad.cc @@ -0,0 +1,90 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements an auto-tuner to tune the pad-copy OpenCL kernels. It uses CLTune. 
+// +// ================================================================================================= + +#include <string> +#include <vector> +#include <stdexcept> + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// The pad auto-tuner +template <typename T> +void PadTune(const Arguments<T> &args, + const std::vector<T> &a_mat, std::vector<T> &b_mat, + cltune::Tuner &tuner) { + + // This points to the PadMatrix kernel as found in the CLBlast library. This is just one + // example of a pad kernel. However, all pad-kernels use the same tuning parameters, so one has + // to be chosen as a representative. + std::string common_source = + #include "../src/kernels/common.opencl" + std::string kernel_source = + #include "../src/kernels/pad.opencl" + auto sources = common_source + kernel_source; + auto id = tuner.AddKernelFromString(sources, "PadMatrix", {args.m, args.n}, {1, 1}); + tuner.SetReferenceFromString(sources, "PadMatrix", {args.m, args.n}, {8, 8}); + + // Sets the tunable parameters and their possible values + tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32}); + tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32}); + tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4}); + tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4}); + + // Tests for a specific precision + tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); + tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + + // Modifies the thread-sizes (both global and local) based on the parameters + tuner.MulLocalSize(id, {"PAD_DIMX", "PAD_DIMY"}); + tuner.DivGlobalSize(id, {"PAD_WPTX", "PAD_WPTY"}); + + // Sets the function's arguments + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentScalar(static_cast<int>(args.n)); + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentScalar(0); + 
tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentScalar(static_cast<int>(args.n)); + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentOutput(b_mat); +} + +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. +void TunerPad(int argc, char *argv[]) { + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: TunerAB<float>(argc, argv, PadTune<float>); break; + case Precision::kDouble: TunerAB<double>(argc, argv, PadTune<double>); break; + case Precision::kComplexSingle: TunerAB<float2>(argc, argv, PadTune<float2>); break; + case Precision::kComplexDouble: TunerAB<double2>(argc, argv, PadTune<double2>); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::TunerPad(argc, argv); + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc new file mode 100644 index 00000000..b2af9925 --- /dev/null +++ b/src/tuning/padtranspose.cc @@ -0,0 +1,95 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements an auto-tuner to tune the pad-transpose OpenCL kernels. It uses CLTune. +// +// ================================================================================================= + +#include <string> +#include <vector> +#include <stdexcept> + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// The transpose auto-tuner +template <typename T> +void PadTransposeTune(const Arguments<T> &args, + const std::vector<T> &a_mat, std::vector<T> &b_mat, + cltune::Tuner &tuner) { + + // This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one + // example of a transpose kernel. However, all kernels use the same tuning parameters, so one has + // to be chosen as a representative. + std::string common_source = + #include "../src/kernels/common.opencl" + std::string kernel_source = + #include "../src/kernels/padtranspose.opencl" + auto sources = common_source + kernel_source; + auto id = tuner.AddKernelFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {1, 1}); + tuner.SetReferenceFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {8, 8}); + + // Sets the tunable parameters and their possible values + tuner.AddParameter(id, "PADTRA_TILE", {8, 16, 32, 64}); + tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16}); + tuner.AddParameter(id, "PADTRA_PAD", {0, 1}); + + // Tests for a specific precision + tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); + tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + + // Sets the constraints for local memory size limitations + auto LocalMemorySize = [args] (std::vector<size_t> v) { + return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, 
{"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); + + // Modifies the thread-sizes (both global and local) based on the parameters + tuner.DivGlobalSize(id, {"PADTRA_WPT", "PADTRA_WPT"}); + tuner.MulLocalSize(id, {"PADTRA_TILE", "PADTRA_TILE"}); + + // Sets the function's arguments + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentScalar(static_cast<int>(args.n)); + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(static_cast<int>(args.n)); + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentScalar(static_cast<int>(args.n)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentOutput(b_mat); +} + +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. +void TunerPadTranspose(int argc, char *argv[]) { + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: TunerAB<float>(argc, argv, PadTransposeTune<float>); break; + case Precision::kDouble: TunerAB<double>(argc, argv, PadTransposeTune<double>); break; + case Precision::kComplexSingle: TunerAB<float2>(argc, argv, PadTransposeTune<float2>); break; + case Precision::kComplexDouble: TunerAB<double2>(argc, argv, PadTransposeTune<double2>); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::TunerPadTranspose(argc, argv); + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc new file mode 100644 index 00000000..90392866 --- /dev/null +++ 
b/src/tuning/transpose.cc @@ -0,0 +1,88 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements an auto-tuner to tune the transpose OpenCL kernels. It uses CLTune. +// +// ================================================================================================= + +#include <string> +#include <vector> +#include <stdexcept> + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// The transpose auto-tuner +template <typename T> +void TransposeTune(const Arguments<T> &args, + const std::vector<T> &a_mat, std::vector<T> &b_mat, + cltune::Tuner &tuner) { + + // This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one + // example of a transpose kernel. However, all kernels use the same tuning parameters, so one has + // to be chosen as a representative. 
+ std::string common_source = + #include "../src/kernels/common.opencl" + std::string kernel_source = + #include "../src/kernels/transpose.opencl" + auto sources = common_source + kernel_source; + auto id = tuner.AddKernelFromString(sources, "TransposeMatrix", {args.m, args.n}, {1, 1}); + tuner.SetReferenceFromString(sources, "TransposeMatrix", {args.m, args.n}, {8, 8}); + + // Sets the tunable parameters and their possible values + tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64}); + tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16}); + tuner.AddParameter(id, "TRA_PAD", {0, 1}); + + // Tests for a specific precision + tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); + tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + + // Sets the constraints for local memory size limitations + auto LocalMemorySize = [args] (std::vector<size_t> v) { + return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); + + // Modifies the thread-sizes (both global and local) based on the parameters + tuner.DivGlobalSize(id, {"TRA_WPT", "TRA_WPT"}); + tuner.MulLocalSize(id, {"TRA_DIM", "TRA_DIM"}); + + // Sets the function's arguments + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentOutput(b_mat); +} + +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. 
+void TunerTranspose(int argc, char *argv[]) { + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: TunerAB<float>(argc, argv, TransposeTune<float>); break; + case Precision::kDouble: TunerAB<double>(argc, argv, TransposeTune<double>); break; + case Precision::kComplexSingle: TunerAB<float2>(argc, argv, TransposeTune<float2>); break; + case Precision::kComplexDouble: TunerAB<double2>(argc, argv, TransposeTune<double2>); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::TunerTranspose(argc, argv); + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc new file mode 100644 index 00000000..bb93c053 --- /dev/null +++ b/src/tuning/tuning.cc @@ -0,0 +1,186 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the common auto-tuning code to interface with the CLTune library. 
+// +// ================================================================================================= + +#include <string> +#include <vector> + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect +// the results. Used for vector-vector routines. +template <typename T> +void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function) { + + // Sets the parameters and platform/device for which to tune (command-line options) + auto help = std::string{"* Options given/available:\n"}; + auto args = Arguments<T>{}; + args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); + args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); + args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); + args.n = GetArgument(argc, argv, help, kArgN, size_t{4096*1024}); + args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); + fprintf(stdout, "%s\n", help.c_str()); + + // Creates input buffers with random data + auto x_vec = std::vector<T>(args.n); + auto y_vec = std::vector<T>(args.n); + PopulateVector(x_vec); + PopulateVector(y_vec); + + // Initializes the tuner for the chosen device + cltune::Tuner tuner(args.platform_id, args.device_id); + + // Use full-search to explore all parameter combinations. 
+ tuner.UseFullSearch(); + + // Configures the tuning parameters (kernel specific) + tune_function(args, x_vec, y_vec, tuner); + + // Starts the tuning process + tuner.Tune(); + + // Prints the results to screen + auto time_ms = tuner.PrintToScreen(); + tuner.PrintFormatted(); + + // Also prints the performance of the best-case in terms of GB/s + const auto mega_bytes = (3*args.n*GetBytes(args.precision)) * 1.0e-6; + if (time_ms != 0.0) { + printf("[ -------> ] %.1lf ms or %.1lf GB/s\n", time_ms, mega_bytes/time_ms); + } +} + +// Compiles the above function +template void TunerXY<float>(int, char**, const Tuner2<float>&); +template void TunerXY<double>(int, char**, const Tuner2<double>&); +template void TunerXY<float2>(int, char**, const Tuner2<float2>&); +template void TunerXY<double2>(int, char**, const Tuner2<double2>&); + +// ================================================================================================= + +// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect +// the results. Used for matrix-matrix routines. 
+template <typename T> +void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function) { + + // Sets the parameters and platform/device for which to tune (command-line options) + auto help = std::string{"* Options given/available:\n"}; + auto args = Arguments<T>{}; + args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); + args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); + args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); + args.m = GetArgument(argc, argv, help, kArgM, size_t{1024}); + args.n = GetArgument(argc, argv, help, kArgN, size_t{1024}); + args.fraction = GetArgument(argc, argv, help, kArgFraction, 2048.0); + fprintf(stdout, "%s\n", help.c_str()); + + // Creates input buffers with random data + auto a_mat = std::vector<T>(args.m * args.n); + auto b_mat = std::vector<T>(args.m * args.n); + PopulateVector(a_mat); + PopulateVector(b_mat); + + // Initializes the tuner for the chosen device + cltune::Tuner tuner(args.platform_id, args.device_id); + + // Use full-search to explore all parameter combinations. 
+ tuner.UseFullSearch(); + + // Configures the tuning parameters (kernel specific) + tune_function(args, a_mat, b_mat, tuner); + + // Starts the tuning process + tuner.Tune(); + + // Prints the results to screen + auto time_ms = tuner.PrintToScreen(); + tuner.PrintFormatted(); + + // Also prints the performance of the best-case in terms of GB/s + const auto mega_bytes = (2*args.m*args.n*GetBytes(args.precision)) * 1.0e-6; + if (time_ms != 0.0) { + printf("[ -------> ] %.1lf ms or %.1lf GB/s\n", time_ms, mega_bytes/time_ms); + } +} + +// Compiles the above function +template void TunerAB<float>(int, char**, const Tuner2<float>&); +template void TunerAB<double>(int, char**, const Tuner2<double>&); +template void TunerAB<float2>(int, char**, const Tuner2<float2>&); +template void TunerAB<double2>(int, char**, const Tuner2<double2>&); + +// ================================================================================================= + +// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect +// the results. Used for matrix-matrix-matrix routines. 
+template <typename T> +void TunerABC(int argc, char* argv[], const Tuner3<T> &tune_function) { + + // Sets the parameters and platform/device for which to tune (command-line options) + auto help = std::string{"* Options given/available:\n"}; + auto args = Arguments<T>{}; + args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); + args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); + args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); + args.m = GetArgument(argc, argv, help, kArgM, size_t{1024}); + args.n = GetArgument(argc, argv, help, kArgN, size_t{1024}); + args.k = GetArgument(argc, argv, help, kArgK, size_t{1024}); + args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); + args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); + args.fraction = GetArgument(argc, argv, help, kArgFraction, 2048.0); + fprintf(stdout, "%s\n", help.c_str()); + + // Creates input buffers with random data + auto a_mat = std::vector<T>(args.m * args.k); + auto b_mat = std::vector<T>(args.n * args.k); + auto c_mat = std::vector<T>(args.m * args.n); + PopulateVector(a_mat); + PopulateVector(b_mat); + PopulateVector(c_mat); + + // Initializes the tuner for the chosen device + cltune::Tuner tuner(args.platform_id, args.device_id); + + // Use random-search to search only a part of the parameter values. The fraction of the search- + // space to explore is set as a command-line argument. 
+ tuner.UseRandomSearch(1.0/args.fraction); + + // Configures the tuning parameters (kernel specific) + tune_function(args, a_mat, b_mat, c_mat, tuner); + + // Starts the tuning process + tuner.Tune(); + + // Prints the results to screen + auto time_ms = tuner.PrintToScreen(); + tuner.PrintFormatted(); + + // Also prints the performance of the best-case in terms of GFLOPS + const auto mega_flops = (2*args.m*args.n*args.k) * 1.0e-6; + if (time_ms != 0.0) { + printf("[ -------> ] %.1lf ms or %.1lf GFLOPS\n", time_ms, mega_flops/time_ms); + } +} + +// Compiles the above function +template void TunerABC<float>(int, char**, const Tuner3<float>&); +template void TunerABC<double>(int, char**, const Tuner3<double>&); +template void TunerABC<float2>(int, char**, const Tuner3<float2>&); +template void TunerABC<double2>(int, char**, const Tuner3<double2>&); + +// ================================================================================================= +} // namespace clblast diff --git a/src/tuning/xaxpy.cc b/src/tuning/xaxpy.cc new file mode 100644 index 00000000..0439ed05 --- /dev/null +++ b/src/tuning/xaxpy.cc @@ -0,0 +1,88 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements an auto-tuner to tune the Xaxpy OpenCL kernel. It uses the CLTune library. 
+// +// ================================================================================================= + +#include <string> +#include <vector> +#include <stdexcept> + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// The Xaxpy auto-tuner +template <typename T> +void XaxpyTune(const Arguments<T> &args, + const std::vector<T> &x_vec, std::vector<T> &y_vec, + cltune::Tuner &tuner) { + + // The XaxpyFast kernel only works under certain conditions. Check here whether the condition is + // true for the reference kernel + if (!IsMultiple(args.n, 64)) { + throw std::runtime_error("The 'XaxpyFast' kernel requires 'n' to be a multiple of WGS*WPT*VW"); + } + + // This points to the XaxpyFast kernel as found in the CLBlast library + std::string common_source = + #include "../src/kernels/common.opencl" + std::string kernel_source = + #include "../src/kernels/xaxpy.opencl" + auto sources = common_source + kernel_source; + auto id = tuner.AddKernelFromString(sources, "XaxpyFast", {args.n}, {1}); + tuner.SetReferenceFromString(sources, "XaxpyFast", {args.n}, {64}); + + // Sets the tunable parameters and their possible values + tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 2048}); + tuner.AddParameter(id, "WPT", {1, 2, 4, 8}); + tuner.AddParameter(id, "VW", {1, 2, 4, 8}); + + // Tests for a specific precision + tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); + tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + + // Modifies the thread-sizes (local) based on the parameters + tuner.MulLocalSize(id, {"WGS"}); + tuner.DivGlobalSize(id, {"WPT"}); + tuner.DivGlobalSize(id, {"VW"}); + + // Sets the function's arguments + tuner.AddArgumentScalar(static_cast<int>(args.n)); + tuner.AddArgumentScalar(args.alpha); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentOutput(y_vec); +} 
+ +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. +void TunerXaxpy(int argc, char *argv[]) { + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: TunerXY<float>(argc, argv, XaxpyTune<float>); break; + case Precision::kDouble: TunerXY<double>(argc, argv, XaxpyTune<double>); break; + case Precision::kComplexSingle: TunerXY<float2>(argc, argv, XaxpyTune<float2>); break; + case Precision::kComplexDouble: TunerXY<double2>(argc, argv, XaxpyTune<double2>); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::TunerXaxpy(argc, argv); + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc new file mode 100644 index 00000000..aba56810 --- /dev/null +++ b/src/tuning/xgemm.cc @@ -0,0 +1,126 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements an auto-tuner to tune the Xgemm OpenCL kernel. It uses the CLTune library. +// Note that this tuner uses random-search: running it multiple times or with a larger fraction +// argument might be neccessary to obtain good results. 
+// +// ================================================================================================= + +#include <string> +#include <vector> +#include <stdexcept> + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// The Xgemm auto-tuner +template <typename T> +void XgemmTune(const Arguments<T> &args, + const std::vector<T> &a_mat, const std::vector<T> &b_mat, std::vector<T> &c_mat, + cltune::Tuner &tuner) { + + // This points to the Xgemm kernel as found in the CLBlast library and its golden reference + std::string common_source = + #include "../src/kernels/common.opencl" + std::string kernel_source = + #include "../src/kernels/xgemm.opencl" + auto sources = common_source + kernel_source; + auto id = tuner.AddKernelFromString(sources, "Xgemm", {args.m, args.n}, {1, 1}); + tuner.SetReferenceFromString(sources, "Xgemm", {args.m, args.n}, {8, 8}); + + // Sets the tunable parameters and their possible values + tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "KWG", {16, 32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2, 8}); + tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); + tuner.AddParameter(id, "STRM", {0, 1}); + tuner.AddParameter(id, "STRN", {0, 1}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + + // Tests for a specific precision + tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); + tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + + // Sets the helper functions to implement the constraints below + auto MultipleOfX = [] 
(std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; + auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); }; + auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; + + // Sets constraints: Requirement for unrolling the KWG loop + tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"}); + + // Sets constraints: Required for integer MWI and NWI + tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}); + tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}); + + // Sets constraints: Required for integer MWIA and NWIB + tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}); + tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}); + + // Sets constraints: KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) + tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); + tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); + + // Sets the constraints for local memory size limitations + auto LocalMemorySize = [args] (std::vector<size_t> v) { + return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM", + "SB", "KWG", "NWG", "VWN"}); + + // Modifies the thread-sizes (both global and local) based on the parameters + tuner.MulLocalSize(id, {"MDIMC", "NDIMC"}); + tuner.MulGlobalSize(id, {"MDIMC", "NDIMC"}); + tuner.DivGlobalSize(id, {"MWG", "NWG"}); + + // Sets the function's arguments + tuner.AddArgumentScalar(static_cast<int>(args.m)); + tuner.AddArgumentScalar(static_cast<int>(args.n)); + tuner.AddArgumentScalar(static_cast<int>(args.k)); + tuner.AddArgumentScalar(args.alpha); + tuner.AddArgumentScalar(args.beta); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentInput(b_mat); + tuner.AddArgumentOutput(c_mat); +} + +// 
=================================================================================================

// Main function which calls the common client code with the routine-specific function as argument.
// It dispatches on the precision requested on the command line: each supported precision
// instantiates the generic matrix-matrix tuner client (TunerABC) with the Xgemm-specific tuning
// function above. Half precision is not supported and raises a run-time error.
void TunerXgemm(int argc, char *argv[]) {
  switch(GetPrecision(argc, argv)) {
    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case Precision::kSingle: TunerABC<float>(argc, argv, XgemmTune<float>); break;
    case Precision::kDouble: TunerABC<double>(argc, argv, XgemmTune<double>); break;
    case Precision::kComplexSingle: TunerABC<float2>(argc, argv, XgemmTune<float2>); break;
    case Precision::kComplexDouble: TunerABC<double2>(argc, argv, XgemmTune<double2>); break;
  }
}

// =================================================================================================
} // namespace clblast

// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  clblast::TunerXgemm(argc, argv);
  return 0;
}

// =================================================================================================
diff --git a/src/utilities.cc b/src/utilities.cc
new file mode 100644
index 00000000..80cea852
--- /dev/null
+++ b/src/utilities.cc
@@ -0,0 +1,255 @@

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the common (test) utility functions.
+// +// ================================================================================================= + +#include "internal/utilities.h" + +#include <string> +#include <vector> +#include <chrono> +#include <random> +#include <iomanip> + +namespace clblast { +// ================================================================================================= + +// Implements the string conversion using std::to_string if possible +template <typename T> +std::string ToString(T value) { + return std::to_string(value); +} +template std::string ToString<int>(int value); +template std::string ToString<size_t>(size_t value); +template std::string ToString<float>(float value); +template std::string ToString<double>(double value); + +// If not possible directly: special cases for complex data-types +template <> +std::string ToString(float2 value) { + std::ostringstream real, imag; + real << std::setprecision(2) << value.real(); + imag << std::setprecision(2) << value.imag(); + return real.str()+"+"+imag.str()+"i"; +} +template <> +std::string ToString(double2 value) { + std::ostringstream real, imag; + real << std::setprecision(2) << value.real(); + imag << std::setprecision(2) << value.imag(); + return real.str()+"+"+imag.str()+"i"; +} + +// If not possible directly: special cases for CLBlast data-types +template <> +std::string ToString(Layout value) { + switch(value) { + case Layout::kRowMajor: return ToString(static_cast<int>(value))+" (row-major)"; + case Layout::kColMajor: return ToString(static_cast<int>(value))+" (col-major)"; + } +} +template <> +std::string ToString(Transpose value) { + switch(value) { + case Transpose::kNo: return ToString(static_cast<int>(value))+" (regular)"; + case Transpose::kYes: return ToString(static_cast<int>(value))+" (transposed)"; + case Transpose::kConjugate: return ToString(static_cast<int>(value))+" (conjugate)"; + } +} +template <> +std::string ToString(Side value) { + switch(value) { + case Side::kLeft: return 
ToString(static_cast<int>(value))+" (left)"; + case Side::kRight: return ToString(static_cast<int>(value))+" (right)"; + } +} +template <> +std::string ToString(Triangle value) { + switch(value) { + case Triangle::kUpper: return ToString(static_cast<int>(value))+" (upper)"; + case Triangle::kLower: return ToString(static_cast<int>(value))+" (lower)"; + } +} +template <> +std::string ToString(Precision value) { + switch(value) { + case Precision::kHalf: return ToString(static_cast<int>(value))+" (half)"; + case Precision::kSingle: return ToString(static_cast<int>(value))+" (single)"; + case Precision::kDouble: return ToString(static_cast<int>(value))+" (double)"; + case Precision::kComplexSingle: return ToString(static_cast<int>(value))+" (complex-single)"; + case Precision::kComplexDouble: return ToString(static_cast<int>(value))+" (complex-double)"; + } +} + +// ================================================================================================= + +// Helper for the below function to convert the argument to the value type. Adds specialization for +// complex data-types. Note that complex arguments are accepted as regular values and are copied to +// both the real and imaginary parts. +template <typename T> +T ConvertArgument(const char* value) { + return static_cast<T>(std::stod(value)); +} +template <> float2 ConvertArgument(const char* value) { + auto val = static_cast<float>(std::stod(value)); + return float2{val, val}; +} +template <> double2 ConvertArgument(const char* value) { + auto val = static_cast<double>(std::stod(value)); + return double2{val, val}; +} + +// This function matches patterns in the form of "-option value" or "--option value". It returns a +// default value in case the option is not found in the argument string. +template <typename T> +T GetArgument(const int argc, char *argv[], std::string &help, + const std::string &option, const T default_value) { + + // Parses the argument. 
Note that this supports both the given option (e.g. -device) and one with + // an extra dash in front (e.g. --device). + auto return_value = static_cast<T>(default_value); + for (int c=0; c<argc; ++c) { + auto item = std::string{argv[c]}; + if (item.compare("-"+option) == 0 || item.compare("--"+option) == 0) { + ++c; + return_value = ConvertArgument<T>(argv[c]); + break; + } + } + + // Updates the help message and returns + help += " -"+option+" "+ToString(return_value)+" "; + help += (return_value == default_value) ? "[=default]\n" : "\n"; + return return_value; +} + +// Compiles the above function +template bool GetArgument<bool>(const int, char **, std::string&, const std::string&, const bool); +template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int); +template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t); +template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float); +template double GetArgument<double>(const int, char **, std::string&, const std::string&, const double); +template float2 GetArgument<float2>(const int, char **, std::string&, const std::string&, const float2); +template double2 GetArgument<double2>(const int, char **, std::string&, const std::string&, const double2); +template Layout GetArgument<Layout>(const int, char **, std::string&, const std::string&, const Layout); +template Transpose GetArgument<Transpose>(const int, char **, std::string&, const std::string&, const Transpose); +template Side GetArgument<Side>(const int, char **, std::string&, const std::string&, const Side); +template Triangle GetArgument<Triangle>(const int, char **, std::string&, const std::string&, const Triangle); +template Precision GetArgument<Precision>(const int, char **, std::string&, const std::string&, const Precision); + +// ================================================================================================= + +// Returns 
only the precision argument +Precision GetPrecision(const int argc, char *argv[]) { + auto dummy = std::string{}; + return GetArgument(argc, argv, dummy, kArgPrecision, Precision::kSingle); +} + +// ================================================================================================= + +// Checks whether an argument is given. Returns true or false. +bool CheckArgument(const int argc, char *argv[], std::string &help, + const std::string &option) { + + // Updates the help message + help += " -"+option+"\n"; + + // Parses the argument. Note that this supports both the given option (e.g. -device) and one with + // an extra dash in front (e.g. --device). + for (int c=0; c<argc; ++c) { + auto item = std::string{argv[c]}; + if (item.compare("-"+option) == 0 || item.compare("--"+option) == 0) { ++c; return true; } + } + return false; +} + +// ================================================================================================= + +// Returns a random seed. This used to be implemented using 'std::random_device', but that doesn't +// always work. The chrono-timers are more reliable in that sense, but perhaps less random. 
+unsigned int GetRandomSeed() { + return static_cast<unsigned int>(std::chrono::system_clock::now().time_since_epoch().count()); +} + +// Create a random number generator and populates a vector with samples from a random distribution +template <typename T> +void PopulateVector(std::vector<T> &vector) { + std::mt19937 mt(GetRandomSeed()); + std::uniform_real_distribution<T> dist(static_cast<T>(-2.0), static_cast<T>(2.0)); + for (auto &element: vector) { element = dist(mt); } +} +template void PopulateVector<float>(std::vector<float>&); +template void PopulateVector<double>(std::vector<double>&); + +// Specialized versions of the above for complex data-types +template <> +void PopulateVector(std::vector<float2> &vector) { + std::mt19937 mt(GetRandomSeed()); + std::uniform_real_distribution<float> dist(-2.0f, 2.0f); + for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } +} +template <> +void PopulateVector(std::vector<double2> &vector) { + std::mt19937 mt(GetRandomSeed()); + std::uniform_real_distribution<double> dist(-2.0, 2.0); + for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } +} + +// ================================================================================================= + +// Returns a scalar with a default value +template <typename T> +T GetScalar() { + return static_cast<T>(2.0); +} +template float GetScalar<float>(); +template double GetScalar<double>(); + +// Specialized versions of the above for complex data-types +template <> +float2 GetScalar() { + return {2.0f, 0.5f}; +} +template <> +double2 GetScalar() { + return {2.0, 0.5}; +} + +// ================================================================================================= + +// Rounding functions performing ceiling and division operations +size_t CeilDiv(const size_t x, const size_t y) { + return 1 + ((x - 1) / y); +} +size_t Ceil(const size_t x, const size_t y) { + return CeilDiv(x,y)*y; +} + +// Helper function to determine 
whether or not 'a' is a multiple of 'b' +bool IsMultiple(const size_t a, const size_t b) { + return ((a/b)*b == a) ? true : false; +}; + +// ================================================================================================= + +// Convert the precision enum (as integer) into bytes +size_t GetBytes(const Precision precision) { + switch(precision) { + case Precision::kHalf: return 2; + case Precision::kSingle: return 4; + case Precision::kDouble: return 8; + case Precision::kComplexSingle: return 8; + case Precision::kComplexDouble: return 16; + } +} + +// ================================================================================================= +} // namespace clblast |