Added first version of banded matrix-vector multiplication

author: CNugteren <web@cedricnugteren.nl> 2015-09-18 15:25:20 +0200
committer: CNugteren <web@cedricnugteren.nl> 2015-09-18 15:25:20 +0200
commit: 4507ba4997cd546418eae0972c018073ac7b36aa (patch)
tree: 08e549a9e4f174a85eb7d9a8efd3735b1daae44a
parent: 42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff)
19 files changed, 497 insertions, 44 deletions
diff --git a/.gitignore b/.gitignore
index 604b0a64..de7becef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 build
 stash
-.*
-\ No newline at end of file
+.*
+*.pyc
+\ No newline at end of file
diff --git a/CHANGELOG b/CHANGELOG
index 5a91d171..ff0646bd 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,8 @@ Development version (next release)
   * SDOT/DDOT
   * CDOTU/ZDOTU
   * CDOTC/ZDOTC
+- Added level-2 routines:
+  * SGBMV/DGBMV/CGBMV/ZGBMV
 
 Version 0.4.0
 - Now using the Claduc C++11 interface to OpenCL
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1960bf1d..988eb683 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,7 +106,7 @@ set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm)
 set(SAMPLE_PROGRAMS_CPP sgemm)
 set(SAMPLE_PROGRAMS_C sgemm)
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
-set(LEVEL2_ROUTINES xgemv xhemv xsymv)
+set(LEVEL2_ROUTINES xgemv xgbmv xhemv xsymv)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
 set(PRECISIONS 32 3232 64 6464)
diff --git a/README.md b/README.md
index 5ffc18d6..98da13f5 100644
--- a/README.md
+++ b/README.md
@@ -153,7 +153,7 @@ CLBlast is in active development and currently does not support the full set of
 | Level-2  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
 | xGEMV    | ✔ | ✔ | ✔ | ✔ |         |
-| xGBMV    |   |   |   |   |         |
+| xGBMV    | ✔ | ✔ | ✔ | ✔ |         |
 | xHEMV    | - | - | ✔ | ✔ |         |
 | xHBMV    | - | - |   |   |         |
 | xHPMV    | - | - |   |   |         |
diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h
new file mode 100644
index 00000000..763168d4
--- /dev/null
+++ b/include/internal/routines/level2/xgbmv.h
@@ -0,0 +1,58 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv routine. It is based on the general matrix-vector multiplication
+// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the
+// "DoGemv" function directly. The "DoGbmv" function first preprocesses the banded matrix by
+// transforming it into a general matrix, and then calls the regular GEMV code.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGBMV_H_
+#define CLBLAST_ROUTINES_XGBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgbmv: public Xgemv<T> {
+ public:
+
+  // Members and methods made available from the Routine<T> base class
+  using Routine<T>::db_;
+  using Routine<T>::context_;
+  using Routine<T>::GetProgramFromCache;
+  using Routine<T>::TestMatrixA;
+  using Routine<T>::RunKernel;
+  using Routine<T>::ErrorIn;
+
+  // The regular Xgemv routine, which DoGbmv runs on the generalized (non-banded) matrix
+  using Xgemv<T>::DoGemv;
+
+  // Constructor: the routine name defaults to "GBMV"
+  Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV");
+
+  // Templated-precision implementation: expands banded A into a general matrix, then runs DoGemv
+  StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n, const size_t kl, const size_t ku,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h
index 801b2fc3..311ad9f8 100644
--- a/include/internal/routines/level2/xhemv.h
+++ b/include/internal/routines/level2/xhemv.h
@@ -8,7 +8,9 @@
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xhemv routine. It is based on the generalized matrix multiplication
-// routine (Xgemv). The implementation is very similar to the Xsymv routine.
+// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the
+// "DoGemv" function directly. The "DoHemv" function first preprocesses the Hermitian matrix by
+// transforming it into a general matrix, and then calls the regular GEMV code.
 //
 // =================================================================================================
 
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
index 466ac810..7ed84efb 100644
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@@ -41,6 +41,8 @@ const std::string kKhronosDoublePrecision = "cl_khr_fp64";
 constexpr auto kArgM = "m";
 constexpr auto kArgN = "n";
 constexpr auto kArgK = "k";
+constexpr auto kArgKL = "kl";
+constexpr auto kArgKU = "ku";
 constexpr auto kArgLayout = "layout";
 constexpr auto kArgATransp = "transA";
 constexpr auto kArgBTransp = "transB";
@@ -87,9 +89,11 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
 template <typename T>
 struct Arguments {
   // Routine-specific arguments
-  size_t m = 0;
-  size_t n = 0;
-  size_t k = 0;
+  size_t m = 1;
+  size_t n = 1;
+  size_t k = 1;
+  size_t ku = 1;
+  size_t kl = 1;
   Layout layout = Layout::kRowMajor;
   Transpose a_transpose = Transpose::kNo;
   Transpose b_transpose = Transpose::kNo;
@@ -100,9 +104,9 @@ struct Arguments {
   size_t y_inc = 1;
   size_t x_offset = 0;
   size_t y_offset = 0;
-  size_t a_ld = 0;
-  size_t b_ld = 0;
-  size_t c_ld = 0;
+  size_t a_ld = 1;
+  size_t b_ld = 1;
+  size_t c_ld = 1;
   size_t a_offset = 0;
   size_t b_offset = 0;
   size_t c_offset = 0;
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 677c8afc..d1171bc0 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -64,7 +64,7 @@ routines = [
 ],
 [ # Level 2: matrix-vector
   Routine(True,  "2a", "gemv",  T,  [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"),
-  Routine(False, "2a", "gbmv",  T,  [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
+  Routine(True,  "2a", "gbmv",  T,  [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
   Routine(True,  "2a", "hemv",  T,  [C,Z],     ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"),
   Routine(False, "2a", "hbmv",  T,  [C,Z],     ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"),
   Routine(False, "2a", "hpmv",  T,  [C,Z],     ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"),
@@ -237,7 +237,7 @@ files = [
   path_clblast+"/src/clblast_c.cc",
   path_clblast+"/test/wrapper_clblas.h",
 ]
-header_lines = [84, 44, 80, 24, 22]
+header_lines = [84, 45, 80, 24, 22]
 footer_lines = [6, 3, 5, 2, 6]
 
 # Checks whether the command-line arguments are valid; exists otherwise
diff --git a/src/clblast.cc b/src/clblast.cc
index a0dd8c70..ad5e354d 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -28,6 +28,7 @@
 
 // BLAS level-2 includes
 #include "internal/routines/level2/xgemv.h"
+#include "internal/routines/level2/xgbmv.h"
 #include "internal/routines/level2/xhemv.h"
 #include "internal/routines/level2/xsymv.h"
 
@@ -327,15 +328,26 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
 
 // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
 template <typename T>
-StatusCode Gbmv(const Layout, const Transpose,
-                const size_t, const size_t, const size_t, const size_t,
-                const T,
-                const cl_mem, const size_t, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
+                const size_t m, const size_t n, const size_t kl, const size_t ku,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xgbmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoGbmv(layout, a_transpose,
+                        m, n, kl, ku,
+                        alpha,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Gbmv<float>(const Layout, const Transpose,
                                 const size_t, const size_t, const size_t, const size_t,
diff --git a/src/kernels/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 1e12dd78..1e12dd78 100644
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
diff --git a/src/kernels/matrix_transforms/gbgemt.opencl b/src/kernels/matrix_transforms/gbgemt.opencl
new file mode 100644
index 00000000..e46e3a59
--- /dev/null
+++ b/src/kernels/matrix_transforms/gbgemt.opencl
@@ -0,0 +1,60 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the general banded (gb) to general (ge) matrix transforms.
+//
+// This kernel uses the matrix-transforms common tuning parameters.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_GBMV)
+
+// Kernel to transform a general banded matrix (src) into a zero-filled general matrix (dest)
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void GeneralBandedToGeneral(const int src_one, const int src_two,
+                                     const int src_ld, const int src_offset,
+                                     __global const real* restrict src,
+                                     const int dest_one, const int dest_two,
+                                     const int dest_ld, const int dest_offset,
+                                     __global real* dest,
+                                     const int layout,  // unused here (as are src_one/src_two)
+                                     const int kl, const int ku) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_two && id_one < dest_one) {  // skips threads outside of dest
+        real result;
+        SetToZero(result);
+        const int k = ku - id_two + id_one;  // position within the band storage of src
+        if ((id_one >= id_two - ku) && (id_one < id_two + kl + 1)) {  // inside the band
+          result = src[id_two*src_ld + k + src_offset];
+        }
+        dest[id_two*dest_ld + id_one + dest_offset] = result;  // stays zero outside the band
+      }
+    }
+  }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/matrix_transforms/transforms.opencl b/src/kernels/matrix_transforms/transforms.opencl
new file mode 100644
index 00000000..01889a13
--- /dev/null
+++ b/src/kernels/matrix_transforms/transforms.opencl
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common functions and parameters specific for matrix-transform kernels.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef PAD_DIMX
+  #define PAD_DIMX 8      // Local workgroup size in the first dimension (x)
+#endif
+#ifndef PAD_DIMY
+  #define PAD_DIMY 8      // Local workgroup size in the second dimension (y)
+#endif
+#ifndef PAD_WPTX
+  #define PAD_WPTX 1      // Work per thread in the first dimension (x)
+#endif
+#ifndef PAD_WPTY
+  #define PAD_WPTY 1      // Work per thread in the second dimension (y)
+#endif
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc
new file mode 100644
index 00000000..eac208b3
--- /dev/null
+++ b/src/routines/level2/xgbmv.cc
@@ -0,0 +1,117 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xgbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards the queue, event, and routine name unchanged to the Xgemv<T> constructor
+template <typename T>
+Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine: expands the banded matrix into a general matrix and then runs the GEMV code
+template <typename T>
+StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n, const size_t kl, const size_t ku,
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // Makes sure all dimensions are larger than zero
+  if (n == 0 || m == 0) { return StatusCode::kInvalidDimension; }
+
+  // Sizes of the temporary general matrix (t) and the banded input (a); row-major swaps m and n
+  auto rotated = (layout == Layout::kRowMajor);
+  auto t_one = (rotated) ? n : m;
+  auto t_two = (rotated) ? m : n;
+  auto a_one = kl+ku+1;
+  auto a_two = (rotated) ? m : n;
+
+  // Checks for validity of the A matrix: its leading dimension and the size of its buffer
+  auto status = StatusCode::kSuccess;
+  if (a_ld < a_one) { return StatusCode::kInvalidLeadDimA; }
+  try {
+    auto required_size = (a_ld*a_two + a_offset)*sizeof(T);
+    auto buffer_size = a_buffer.GetSize();
+    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
+  } catch (...) { return StatusCode::kInvalidMatrixA; }
+
+  // Temporary buffer of t_one*t_two elements to hold the generalized input matrix
+  try {
+    auto t_buffer = Buffer<T>(context_, t_one*t_two);
+
+    // Creates a general matrix from the input to be able to run the regular Xgemv routine
+    try {
+      auto& program = GetProgramFromCache();
+      auto kernel = Kernel(program, "GeneralBandedToGeneral");
+
+      // Sets the arguments for the matrix transform kernel
+      kernel.SetArgument(0, static_cast<int>(a_one));
+      kernel.SetArgument(1, static_cast<int>(a_two));
+      kernel.SetArgument(2, static_cast<int>(a_ld));
+      kernel.SetArgument(3, static_cast<int>(a_offset));
+      kernel.SetArgument(4, a_buffer());
+      kernel.SetArgument(5, static_cast<int>(t_one));
+      kernel.SetArgument(6, static_cast<int>(t_two));
+      kernel.SetArgument(7, static_cast<int>(t_one));
+      kernel.SetArgument(8, static_cast<int>(0));
+      kernel.SetArgument(9, t_buffer());
+      kernel.SetArgument(10, static_cast<int>(layout));
+      if (rotated) {  // row-major: kl and ku are passed to the kernel in swapped order
+        kernel.SetArgument(11, static_cast<int>(ku));
+        kernel.SetArgument(12, static_cast<int>(kl));
+      }
+      else {
+        kernel.SetArgument(11, static_cast<int>(kl));
+        kernel.SetArgument(12, static_cast<int>(ku));
+      }
+
+      // Uses the common matrix-transforms thread configuration
+      auto global = std::vector<size_t>{Ceil(CeilDiv(t_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                        Ceil(CeilDiv(t_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the regular Xgemv code on the temporary general matrix instead of on A
+      status = DoGemv(layout, a_transpose, m, n, alpha,
+                      t_buffer, 0, t_one,  // generalized matrix: zero offset, ld of t_one
+                      x_buffer, x_offset, x_inc, beta,
+                      y_buffer, y_offset, y_inc);
+
+      // Return the status of the Xgemv routine
+      return status;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgbmv<float>;
+template class Xgbmv<double>;
+template class Xgbmv<float2>;
+template class Xgbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index f95a9957..e52d2f20 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -32,8 +32,10 @@ template <typename T>
 Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
     Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
   source_string_ =
-    #include "../../kernels/pad.opencl" // For {Herm,Symm}{Upper,Lower}ToSquared (for HEMV/SYMV)
-    #include "../../kernels/xgemv.opencl"
+    #include "../../kernels/pad.opencl" // TODO: replace
+    #include "../../kernels/matrix_transforms/transforms.opencl"
+    #include "../../kernels/matrix_transforms/gbgemt.opencl"
+    #include "../../kernels/level2/xgemv.opencl"
   ;
 }
 
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 3d6fe595..6a066518 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -34,7 +34,7 @@ class TuneXgemv {
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/xgemv.opencl"
+      #include "../src/kernels/level2/xgemv.opencl"
     ;
   }
 
diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h
index 9e1d110c..d4e6f24b 100644
--- a/test/correctness/testblas.h
+++ b/test/correctness/testblas.h
@@ -49,6 +49,7 @@ class TestBlas: public Tester<T,U> {
   const std::vector<size_t> kIncrements = { 1, 2, 7 };
   const std::vector<size_t> kMatrixDims = { 7, 64 };
   const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
+  const std::vector<size_t> kBandSizes = { 4, 19 };
   const std::vector<size_t> kOffsets = GetOffsets();
   const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
   const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
@@ -121,6 +122,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
   auto ms = std::vector<size_t>{args.m};
   auto ns = std::vector<size_t>{args.n};
   auto ks = std::vector<size_t>{args.k};
+  auto kus = std::vector<size_t>{args.ku};
+  auto kls = std::vector<size_t>{args.kl};
   auto layouts = std::vector<Layout>{args.layout};
   auto a_transposes = std::vector<Transpose>{args.a_transpose};
   auto b_transposes = std::vector<Transpose>{args.b_transpose};
@@ -156,6 +159,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
     if (option == kArgM) { ms = dimensions; }
     if (option == kArgN) { ns = dimensions; }
     if (option == kArgK) { ks = dimensions; }
+    if (option == kArgKU) { kus = tester.kBandSizes; }
+    if (option == kArgKL) { kls = tester.kBandSizes; }
     if (option == kArgLayout) { layouts = tester.kLayouts; }
     if (option == kArgATransp) { a_transposes = C::GetATransposes(tester.kTransposes); }
     if (option == kArgBTransp) { b_transposes = C::GetBTransposes(tester.kTransposes); }
@@ -197,21 +202,25 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
               for (auto &m: ms) { r_args.m = m;
                 for (auto &n: ns) { r_args.n = n;
                   for (auto &k: ks) { r_args.k = k;
-                    for (auto &x_inc: x_incs) { r_args.x_inc = x_inc;
-                      for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset;
-                        for (auto &y_inc: y_incs) { r_args.y_inc = y_inc;
-                          for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset;
-                            for (auto &a_ld: a_lds) { r_args.a_ld = a_ld;
-                              for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset;
-                                for (auto &b_ld: b_lds) { r_args.b_ld = b_ld;
-                                  for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset;
-                                    for (auto &c_ld: c_lds) { r_args.c_ld = c_ld;
-                                      for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset;
-                                        for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset;
-                                          for (auto &alpha: alphas) { r_args.alpha = alpha;
-                                            for (auto &beta: betas) { r_args.beta = beta;
-                                              C::SetSizes(r_args);
-                                              regular_test_vector.push_back(r_args);
+                    for (auto &ku: kus) { r_args.ku = ku;
+                      for (auto &kl: kls) { r_args.kl = kl;
+                        for (auto &x_inc: x_incs) { r_args.x_inc = x_inc;
+                          for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset;
+                            for (auto &y_inc: y_incs) { r_args.y_inc = y_inc;
+                              for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset;
+                                for (auto &a_ld: a_lds) { r_args.a_ld = a_ld;
+                                  for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset;
+                                    for (auto &b_ld: b_lds) { r_args.b_ld = b_ld;
+                                      for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset;
+                                        for (auto &c_ld: c_lds) { r_args.c_ld = c_ld;
+                                          for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset;
+                                            for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset;
+                                              for (auto &alpha: alphas) { r_args.alpha = alpha;
+                                                for (auto &beta: betas) { r_args.beta = beta;
+                                                  C::SetSizes(r_args);
+                                                  regular_test_vector.push_back(r_args);
+                                                }
+                                              }
                                             }
                                           }
                                         }
@@ -232,7 +241,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
               // Creates the arguments vector for the invalid-buffer tests
               auto invalid_test_vector = std::vector<Arguments<U>>{};
               auto i_args = args;
-              i_args.m = i_args.n = i_args.k = tester.kBufferSize;
+              i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
               i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
               for (auto &x_size: x_sizes) { i_args.x_size = x_size;
                 for (auto &y_size: y_sizes) { i_args.y_size = y_size;
diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc
index f792925e..b1a58102 100644
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@@ -132,6 +132,8 @@ void Tester<T,U>::TestEnd() {
       if (o == kArgM)        { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); }
       if (o == kArgN)        { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); }
       if (o == kArgK)        { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); }
+      if (o == kArgKU)       { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); }
+      if (o == kArgKL)       { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); }
       if (o == kArgLayout)   { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
       if (o == kArgATransp)  { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
       if (o == kArgBTransp)  { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
diff --git a/test/performance/client.cc b/test/performance/client.cc
index 9faa4dca..a28ec5fd 100644
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@@ -48,9 +48,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
   for (auto &o: options_) {
 
     // Data-sizes
-    if (o == kArgM) { args.m  = GetArgument(argc, argv, help, kArgM, 512UL); }
-    if (o == kArgN) { args.n  = GetArgument(argc, argv, help, kArgN, 512UL); }
-    if (o == kArgK) { args.k  = GetArgument(argc, argv, help, kArgK, 512UL); }
+    if (o == kArgM)  { args.m   = GetArgument(argc, argv, help, kArgM, 512UL); }
+    if (o == kArgN)  { args.n   = GetArgument(argc, argv, help, kArgN, 512UL); }
+    if (o == kArgK)  { args.k   = GetArgument(argc, argv, help, kArgK, 512UL); }
+    if (o == kArgKU) { args.ku  = GetArgument(argc, argv, help, kArgKU, 128UL); }
+    if (o == kArgKL) { args.kl  = GetArgument(argc, argv, help, kArgKL, 128UL); }
 
     // Data-layouts
     if (o == kArgLayout)   { args.layout      = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@@ -225,8 +227,10 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas
   auto integers = std::vector<size_t>{};
   for (auto &o: options_) {
     if      (o == kArgM) {        integers.push_back(args.m); }
-    if      (o == kArgN) {        integers.push_back(args.n); }
+    else if (o == kArgN) {        integers.push_back(args.n); }
     else if (o == kArgK) {        integers.push_back(args.k); }
+    else if (o == kArgKU) {       integers.push_back(args.ku); }
+    else if (o == kArgKL) {       integers.push_back(args.kl); }
     else if (o == kArgLayout) {   integers.push_back(static_cast<size_t>(args.layout)); }
     else if (o == kArgSide) {     integers.push_back(static_cast<size_t>(args.side)); }
     else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h
new file mode 100644
index 00000000..0e238804
--- /dev/null
+++ b/test/routines/level2/xgbmv.h
@@ -0,0 +1,140 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xgbmv routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XGBMV_H_
+#define CLBLAST_TEST_ROUTINES_XGBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXgbmv {
+ public:
+
+  // The BLAS level: 1, 2, or 3 (GBMV is a level-2 routine)
+  static size_t BLASLevel() { return 2; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN, kArgKL, kArgKU,
+            kArgLayout, kArgATransp,
+            kArgALeadDim, kArgXInc, kArgYInc,
+            kArgAOffset, kArgXOffset, kArgYOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers (in elements of T, offsets included)
+  static size_t GetSizeX(const Arguments<T> &args) {
+    auto a_transposed = (args.a_transpose != Transpose::kNo);
+    auto n_real = (a_transposed) ? args.m : args.n; // x has m entries when A is transposed
+    return n_real * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    auto a_transposed = (args.a_transpose != Transpose::kNo);
+    auto m_real = (a_transposed) ? args.n : args.m; // y has n entries when A is transposed
+    return m_real * args.y_inc + args.y_offset;
+  }
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kRowMajor);
+    auto a_two = (a_rotated) ? args.m : args.n; // number of lines of length 'a_ld' in the buffer
+    return a_two * args.a_ld + args.a_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine; waits on and releases the event if one was set
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Gbmv(args.layout, args.a_transpose,
+                       args.m, args.n, args.kl, args.ku, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       &queue_plain, &event);
+    if (event != nullptr) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              args.m, args.n, args.kl, args.ku, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                              buffers.y_vec(), args.y_offset, args.y_inc,
+                              1, &queue_plain, 0, nullptr, &event);
+    if (event != nullptr) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.Read(queue, args.y_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    auto a_transposed = (args.a_transpose != Transpose::kNo);
+    return (a_transposed) ? args.n : args.m;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.y_inc + args.y_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.n; // counted over the full m-by-n matrix; kl/ku are not used here
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kRowMajor);
+    auto a_one = (a_rotated) ? args.n : args.m;
+    auto a_two = (a_rotated) ? args.m : args.n;
+    return ((args.kl+args.ku+1)*a_two + 2*a_one + a_two) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XGBMV_H_
+#endif
author	CNugteren <web@cedricnugteren.nl>	2015-09-18 15:25:20 +0200
committer	CNugteren <web@cedricnugteren.nl>	2015-09-18 15:25:20 +0200
commit	4507ba4997cd546418eae0972c018073ac7b36aa (patch)
tree	08e549a9e4f174a85eb7d9a8efd3735b1daae44a
parent	42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff)