diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-03-11 16:02:45 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-03-11 16:02:45 +0100 |
commit | 7b8f8fce6808f2095a68afe97256db7a78f819fa (patch) | |
tree | 98f394e504740389cebf1b9061e269645e51ec7d | |
parent | 49e04c7fce8fed45559e143137cef3a1a36328cc (diff) |
Added initial naive version of the batched GEMM routine based on the direct GEMM kernel
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | README.md | 7 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_direct_batched.opencl | 110 | ||||
-rw-r--r-- | src/routines/level3/xgemm.cpp | 22 | ||||
-rw-r--r-- | src/routines/levelx/xgemmbatched.cpp | 69 |
5 files changed, 182 insertions, 27 deletions
@@ -15,6 +15,7 @@ Development version (next release) * STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized) - Added batched (non-BLAS) routines: * SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED (batched version of AXPY) + * SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED (batched version of GEMM) Version 0.10.0 - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header @@ -276,6 +276,13 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non- | xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ | | xTRSM | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized) +Furthermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance: + +| Batched | S | D | C | Z | H | +| -------------|---|---|---|---|---| +| xAXPYBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ | +| xGEMMBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ | + In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care: | Level-X | S | D | C | Z | H | diff --git a/src/kernels/level3/xgemm_direct_batched.opencl b/src/kernels/level3/xgemm_direct_batched.opencl new file mode 100644 index 00000000..3377d5c7 --- /dev/null +++ b/src/kernels/level3/xgemm_direct_batched.opencl @@ -0,0 +1,110 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the batched version of the GEMM kernels. See part 1 for information about the +// non-batched version of the kernel. 
+// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant 
int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void 
XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK, + const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas, + const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld, + const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld, + __global real* cgm, const __constant int* c_offsets, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + const int batch = get_group_id(2); + const real_arg arg_alpha = arg_alphas[batch]; + const real_arg arg_beta = arg_betas[batch]; + const int a_offset = a_offsets[batch]; + const int b_offset = b_offsets[batch]; + const int c_offset = c_offsets[batch]; + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate); +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index dc8c64bc..658b22d0 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -104,19 +104,19 @@ void Xgemm<T>::DoGemm(const Layout layout, // Selects which version of GEMM to run const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]); if (do_gemm_direct) { // for small sizes (single kernel) - return GemmDirect(m, n, k, alpha, - a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, - a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); + GemmDirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, 
b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); } else { // for larger sizes (pre/post-processing plus a very fast kernel) - return GemmIndirect(m, n, k, alpha, - a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, - a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, - a_one, a_two, a_want_rotated, - b_one, b_two, b_want_rotated, - c_one, c_two, c_want_rotated); + GemmIndirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, + a_one, a_two, a_want_rotated, + b_one, b_two, b_want_rotated, + c_one, c_two, c_want_rotated); } } diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp index b07425d5..a11ebfd0 100644 --- a/src/routines/levelx/xgemmbatched.cpp +++ b/src/routines/levelx/xgemmbatched.cpp @@ -22,25 +22,12 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, - {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, - PrecisionValue<T>(), {}, { - #include "../../kernels/level3/level3.opencl" - #include "../../kernels/level3/copy_fast.opencl" - #include "../../kernels/level3/copy_pad.opencl" - #include "../../kernels/level3/transpose_fast.opencl" - #include "../../kernels/level3/transpose_pad.opencl" - #include "../../kernels/level3/convert_symmetric.opencl" - #include "../../kernels/level3/convert_triangular.opencl" - #include "../../kernels/level3/convert_hermitian.opencl" - , // separated in multiple parts to prevent C1091 in MSVC 2013 + Routine(queue, event, name, {"XgemmDirect"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" #include 
"../../kernels/level3/xgemm_direct_part3.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 - #include "../../kernels/level3/xgemm_part1.opencl" - #include "../../kernels/level3/xgemm_part2.opencl" - #include "../../kernels/level3/xgemm_part3.opencl" + #include "../../kernels/level3/xgemm_direct_batched.opencl" }) { } @@ -99,7 +86,57 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans TestMatrixC(c_one, c_two, c_buffer, c_offsets[batch], c_ld); } - // StatusCode::kNotImplemented; + // Upload the arguments to the device + std::vector<int> a_offsets_int(a_offsets.begin(), a_offsets.end()); + std::vector<int> b_offsets_int(b_offsets.begin(), b_offsets.end()); + std::vector<int> c_offsets_int(c_offsets.begin(), c_offsets.end()); + auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); + auto betas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); + a_offsets_device.Write(queue_, batch_count, a_offsets_int); + b_offsets_device.Write(queue_, batch_count, b_offsets_int); + c_offsets_device.Write(queue_, batch_count, c_offsets_int); + alphas_device.Write(queue_, batch_count, alphas); + betas_device.Write(queue_, batch_count, betas); + + // Retrieves the proper XgemmDirect kernel from the compiled binary + const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectBatchedTT" : "XgemmDirectBatchedTN") : + (b_do_transpose ? 
"XgemmDirectBatchedNT" : "XgemmDirectBatchedNN"); + auto kernel = Kernel(program_, name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(m)); + kernel.SetArgument(1, static_cast<int>(n)); + kernel.SetArgument(2, static_cast<int>(k)); + kernel.SetArgument(3, alphas_device()); + kernel.SetArgument(4, betas_device()); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, a_offsets_device()); + kernel.SetArgument(7, static_cast<int>(a_ld)); + kernel.SetArgument(8, b_buffer()); + kernel.SetArgument(9, b_offsets_device()); + kernel.SetArgument(10, static_cast<int>(b_ld)); + kernel.SetArgument(11, c_buffer()); + kernel.SetArgument(12, c_offsets_device()); + kernel.SetArgument(13, static_cast<int>(c_ld)); + kernel.SetArgument(14, static_cast<int>(c_do_transpose)); + kernel.SetArgument(15, static_cast<int>(a_conjugate)); + kernel.SetArgument(16, static_cast<int>(b_conjugate)); + + // Computes the global and local thread sizes + const auto m_ceiled = Ceil(m, db_["WGD"]); + const auto n_ceiled = Ceil(n, db_["WGD"]); + const auto global = std::vector<size_t>{ + (m_ceiled * db_["MDIMCD"]) / db_["WGD"], + (n_ceiled * db_["NDIMCD"]) / db_["WGD"], + batch_count + }; + const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"], 1}; + + // Launches the kernel + RunKernel(kernel, queue_, device_, global, local, event_); } // ================================================================================================= |