diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-03-11 16:02:45 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-03-11 16:02:45 +0100 |
commit | 7b8f8fce6808f2095a68afe97256db7a78f819fa (patch) | |
tree | 98f394e504740389cebf1b9061e269645e51ec7d | |
parent | 49e04c7fce8fed45559e143137cef3a1a36328cc (diff) |
Added initial naive version of the batched GEMM routine based on the direct GEMM kernel
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | README.md | 7 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_direct_batched.opencl | 110 | ||||
-rw-r--r-- | src/routines/level3/xgemm.cpp | 22 | ||||
-rw-r--r-- | src/routines/levelx/xgemmbatched.cpp | 69 |
5 files changed, 182 insertions, 27 deletions
@@ -15,6 +15,7 @@ Development version (next release) * STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized) - Added batched (non-BLAS) routines: * SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED (batched version of AXPY) + * SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED (batched version of GEMM) Version 0.10.0 - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header @@ -276,6 +276,13 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non- | xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ | | xTRSM | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized) +Furthermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance: + +| Batched | S | D | C | Z | H | +| -------------|---|---|---|---|---| +| xAXPYBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ | +| xGEMMBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ | + In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care: | Level-X | S | D | C | Z | H | diff --git a/src/kernels/level3/xgemm_direct_batched.opencl b/src/kernels/level3/xgemm_direct_batched.opencl new file mode 100644 index 00000000..3377d5c7 --- /dev/null +++ b/src/kernels/level3/xgemm_direct_batched.opencl @@ -0,0 +1,110 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the batched version of the GEMM kernels. See part 1 for information about the +// non-batched version of the kernel. 
+// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant 
int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void 
XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate); +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index dc8c64bc..658b22d0 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -104,19 +104,19 @@ void Xgemm<T>::DoGemm(const Layout layout, // Selects which version of GEMM to run const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]); if (do_gemm_direct) { // for small sizes (single kernel) - return GemmDirect(m, n, k, alpha, - a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, - a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); + GemmDirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, 
b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); } else { // for larger sizes (pre/post-processing plus a very fast kernel) - return GemmIndirect(m, n, k, alpha, - a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, - a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, - a_one, a_two, a_want_rotated, - b_one, b_two, b_want_rotated, - c_one, c_two, c_want_rotated); + GemmIndirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, + a_one, a_two, a_want_rotated, + b_one, b_two, b_want_rotated, + c_one, c_two, c_want_rotated); } } diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp index b07425d5..a11ebfd0 100644 --- a/src/routines/levelx/xgemmbatched.cpp +++ b/src/routines/levelx/xgemmbatched.cpp @@ -22,25 +22,12 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, - {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, - PrecisionValue<T>(), {}, { - #include "../../kernels/level3/level3.opencl" - #include "../../kernels/level3/copy_fast.opencl" - #include "../../kernels/level3/copy_pad.opencl" - #include "../../kernels/level3/transpose_fast.opencl" - #include "../../kernels/level3/transpose_pad.opencl" - #include "../../kernels/level3/convert_symmetric.opencl" - #include "../../kernels/level3/convert_triangular.opencl" - #include "../../kernels/level3/convert_hermitian.opencl" - , // separated in multiple parts to prevent C1091 in MSVC 2013 + Routine(queue, event, name, {"XgemmDirect"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" #include 
"../../kernels/level3/xgemm_direct_part3.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 - #include "../../kernels/level3/xgemm_part1.opencl" - #include "../../kernels/level3/xgemm_part2.opencl" - #include "../../kernels/level3/xgemm_part3.opencl" + #include "../../kernels/level3/xgemm_direct_batched.opencl" }) { } @@ -99,7 +86,57 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans TestMatrixC(c_one, c_two, c_buffer, c_offsets[batch], c_ld); } - // StatusCode::kNotImplemented; + // Upload the arguments to the device + std::vector<int> a_offsets_int(a_offsets.begin(), a_offsets.end()); + std::vector<int> b_offsets_int(b_offsets.begin(), b_offsets.end()); + std::vector<int> c_offsets_int(c_offsets.begin(), c_offsets.end()); + auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); + auto betas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); + a_offsets_device.Write(queue_, batch_count, a_offsets_int); + b_offsets_device.Write(queue_, batch_count, b_offsets_int); + c_offsets_device.Write(queue_, batch_count, c_offsets_int); + alphas_device.Write(queue_, batch_count, alphas); + betas_device.Write(queue_, batch_count, betas); + + // Retrieves the proper XgemmDirect kernel from the compiled binary + const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectBatchedTT" : "XgemmDirectBatchedTN") : + (b_do_transpose ? 
"XgemmDirectBatchedNT" : "XgemmDirectBatchedNN"); + auto kernel = Kernel(program_, name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(m)); + kernel.SetArgument(1, static_cast<int>(n)); + kernel.SetArgument(2, static_cast<int>(k)); + kernel.SetArgument(3, alphas_device()); + kernel.SetArgument(4, betas_device()); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, a_offsets_device()); + kernel.SetArgument(7, static_cast<int>(a_ld)); + kernel.SetArgument(8, b_buffer()); + kernel.SetArgument(9, b_offsets_device()); + kernel.SetArgument(10, static_cast<int>(b_ld)); + kernel.SetArgument(11, c_buffer()); + kernel.SetArgument(12, c_offsets_device()); + kernel.SetArgument(13, static_cast<int>(c_ld)); + kernel.SetArgument(14, static_cast<int>(c_do_transpose)); + kernel.SetArgument(15, static_cast<int>(a_conjugate)); + kernel.SetArgument(16, static_cast<int>(b_conjugate)); + + // Computes the global and local thread sizes + const auto m_ceiled = Ceil(m, db_["WGD"]); + const auto n_ceiled = Ceil(n, db_["WGD"]); + const auto global = std::vector<size_t>{ + (m_ceiled * db_["MDIMCD"]) / db_["WGD"], + (n_ceiled * db_["NDIMCD"]) / db_["WGD"], + batch_count + }; + const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"], 1}; + + // Launches the kernel + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= |