summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCNugteren <web@cedricnugteren.nl>2015-09-18 15:25:20 +0200
committerCNugteren <web@cedricnugteren.nl>2015-09-18 15:25:20 +0200
commit4507ba4997cd546418eae0972c018073ac7b36aa (patch)
tree08e549a9e4f174a85eb7d9a8efd3735b1daae44a
parent42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff)
Added first version of banded matrix-vector multiplication
-rw-r--r--.gitignore3
-rw-r--r--CHANGELOG2
-rw-r--r--CMakeLists.txt2
-rw-r--r--README.md2
-rw-r--r--include/internal/routines/level2/xgbmv.h58
-rw-r--r--include/internal/routines/level2/xhemv.h4
-rw-r--r--include/internal/utilities.h16
-rw-r--r--scripts/generator/generator.py4
-rw-r--r--src/clblast.cc30
-rw-r--r--src/kernels/level2/xgemv.opencl (renamed from src/kernels/xgemv.opencl)0
-rw-r--r--src/kernels/matrix_transforms/gbgemt.opencl60
-rw-r--r--src/kernels/matrix_transforms/transforms.opencl40
-rw-r--r--src/routines/level2/xgbmv.cc117
-rw-r--r--src/routines/level2/xgemv.cc6
-rw-r--r--src/tuning/xgemv.cc2
-rw-r--r--test/correctness/testblas.h41
-rw-r--r--test/correctness/tester.cc2
-rw-r--r--test/performance/client.cc12
-rw-r--r--test/routines/level2/xgbmv.h140
19 files changed, 497 insertions, 44 deletions
diff --git a/.gitignore b/.gitignore
index 604b0a64..de7becef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
build
stash
-.* \ No newline at end of file
+.*
+*.pyc \ No newline at end of file
diff --git a/CHANGELOG b/CHANGELOG
index 5a91d171..ff0646bd 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,8 @@ Development version (next release)
* SDOT/DDOT
* CDOTU/ZDOTU
* CDOTC/ZDOTC
+- Added level-2 routines:
+ * SGBMV/DGBMV/CGBMV/ZGBMV
Version 0.4.0
- Now using the Claduc C++11 interface to OpenCL
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1960bf1d..988eb683 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,7 +106,7 @@ set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sgemm)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
-set(LEVEL2_ROUTINES xgemv xhemv xsymv)
+set(LEVEL2_ROUTINES xgemv xgbmv xhemv xsymv)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
set(PRECISIONS 32 3232 64 6464)
diff --git a/README.md b/README.md
index 5ffc18d6..98da13f5 100644
--- a/README.md
+++ b/README.md
@@ -153,7 +153,7 @@ CLBlast is in active development and currently does not support the full set of
| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
-| xGBMV | | | | | |
+| xGBMV | ✔ | ✔ | ✔ | ✔ | |
| xHEMV | - | - | ✔ | ✔ | |
| xHBMV | - | - | | | |
| xHPMV | - | - | | | |
diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h
new file mode 100644
index 00000000..763168d4
--- /dev/null
+++ b/include/internal/routines/level2/xgbmv.h
@@ -0,0 +1,58 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv routine. It is based on the general matrix-vector multiplication
+// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the
+// "DoGemv" function directly. The "DoGbmv" function first preprocesses the banded matrix by
+// transforming it into a general matrix, and then calls the regular GEMV code.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGBMV_H_
+#define CLBLAST_ROUTINES_XGBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgbmv: public Xgemv<T> {
+ public:
+
+ // Members and methods from the base class
+ using Routine<T>::db_;
+ using Routine<T>::context_;
+ using Routine<T>::GetProgramFromCache;
+ using Routine<T>::TestMatrixA;
+ using Routine<T>::RunKernel;
+ using Routine<T>::ErrorIn;
+
+ // Uses the regular Xgemv routine
+ using Xgemv<T>::DoGemv;
+
+ // Constructor
+ Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h
index 801b2fc3..311ad9f8 100644
--- a/include/internal/routines/level2/xhemv.h
+++ b/include/internal/routines/level2/xhemv.h
@@ -8,7 +8,9 @@
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhemv routine. It is based on the generalized matrix multiplication
-// routine (Xgemv). The implementation is very similar to the Xsymv routine.
+// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the
+// "DoGemv" function directly. The "DoHemv" function first preprocesses the Hermitian matrix by
+// transforming it into a general matrix, and then calls the regular GEMV code.
//
// =================================================================================================
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
index 466ac810..7ed84efb 100644
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@@ -41,6 +41,8 @@ const std::string kKhronosDoublePrecision = "cl_khr_fp64";
constexpr auto kArgM = "m";
constexpr auto kArgN = "n";
constexpr auto kArgK = "k";
+constexpr auto kArgKL = "kl";
+constexpr auto kArgKU = "ku";
constexpr auto kArgLayout = "layout";
constexpr auto kArgATransp = "transA";
constexpr auto kArgBTransp = "transB";
@@ -87,9 +89,11 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
template <typename T>
struct Arguments {
// Routine-specific arguments
- size_t m = 0;
- size_t n = 0;
- size_t k = 0;
+ size_t m = 1;
+ size_t n = 1;
+ size_t k = 1;
+ size_t ku = 1;
+ size_t kl = 1;
Layout layout = Layout::kRowMajor;
Transpose a_transpose = Transpose::kNo;
Transpose b_transpose = Transpose::kNo;
@@ -100,9 +104,9 @@ struct Arguments {
size_t y_inc = 1;
size_t x_offset = 0;
size_t y_offset = 0;
- size_t a_ld = 0;
- size_t b_ld = 0;
- size_t c_ld = 0;
+ size_t a_ld = 1;
+ size_t b_ld = 1;
+ size_t c_ld = 1;
size_t a_offset = 0;
size_t b_offset = 0;
size_t c_offset = 0;
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 677c8afc..d1171bc0 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -64,7 +64,7 @@ routines = [
],
[ # Level 2: matrix-vector
Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"),
- Routine(False, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
+ Routine(True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"),
Routine(False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"),
Routine(False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"),
@@ -237,7 +237,7 @@ files = [
path_clblast+"/src/clblast_c.cc",
path_clblast+"/test/wrapper_clblas.h",
]
-header_lines = [84, 44, 80, 24, 22]
+header_lines = [84, 45, 80, 24, 22]
footer_lines = [6, 3, 5, 2, 6]
# Checks whether the command-line arguments are valid; exists otherwise
diff --git a/src/clblast.cc b/src/clblast.cc
index a0dd8c70..ad5e354d 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -28,6 +28,7 @@
// BLAS level-2 includes
#include "internal/routines/level2/xgemv.h"
+#include "internal/routines/level2/xgbmv.h"
#include "internal/routines/level2/xhemv.h"
#include "internal/routines/level2/xsymv.h"
@@ -327,15 +328,26 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
template <typename T>
-StatusCode Gbmv(const Layout, const Transpose,
- const size_t, const size_t, const size_t, const size_t,
- const T,
- const cl_mem, const size_t, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xgbmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoGbmv(layout, a_transpose,
+ m, n, kl, ku,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Gbmv<float>(const Layout, const Transpose,
const size_t, const size_t, const size_t, const size_t,
diff --git a/src/kernels/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 1e12dd78..1e12dd78 100644
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
diff --git a/src/kernels/matrix_transforms/gbgemt.opencl b/src/kernels/matrix_transforms/gbgemt.opencl
new file mode 100644
index 00000000..e46e3a59
--- /dev/null
+++ b/src/kernels/matrix_transforms/gbgemt.opencl
@@ -0,0 +1,60 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the general banded (gb) to general (ge) matrix transforms.
+//
+// This kernel uses the matrix-transforms common tuning parameters.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_GBMV)
+
+// Kernel to transform a general banded matrix into a general matrix
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void GeneralBandedToGeneral(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const int layout,
+ const int kl, const int ku) {
+
+ // Loops over the work per thread in both dimensions
+ #pragma unroll
+ for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+ const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+ #pragma unroll
+ for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+ const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+ if (id_two < dest_two && id_one < dest_one) {
+ real result;
+ SetToZero(result);
+ const int k = ku - id_two + id_one;
+ if ((id_one >= id_two - ku) && (id_one < id_two + kl + 1)) {
+ result = src[id_two*src_ld + k + src_offset];
+ }
+ dest[id_two*dest_ld + id_one + dest_offset] = result;
+ }
+ }
+ }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/matrix_transforms/transforms.opencl b/src/kernels/matrix_transforms/transforms.opencl
new file mode 100644
index 00000000..01889a13
--- /dev/null
+++ b/src/kernels/matrix_transforms/transforms.opencl
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common functions and parameters specific for matrix-transform kernels.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef PAD_DIMX
+ #define PAD_DIMX 8 // Local workgroup size in the first dimension (x)
+#endif
+#ifndef PAD_DIMY
+ #define PAD_DIMY 8 // Local workgroup size in the second dimension (y)
+#endif
+#ifndef PAD_WPTX
+ #define PAD_WPTX 1 // Work per thread in the first dimension (x)
+#endif
+#ifndef PAD_WPTY
+ #define PAD_WPTY 1 // Work per thread in the second dimension (y)
+#endif
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc
new file mode 100644
index 00000000..eac208b3
--- /dev/null
+++ b/src/routines/level2/xgbmv.cc
@@ -0,0 +1,117 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xgbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // Makes sure all dimensions are larger than zero
+ if (n == 0 || m == 0) { return StatusCode::kInvalidDimension; }
+
+ // Computes the dimensions of the banded input (a) and of the temporary general matrix (t)
+ auto rotated = (layout == Layout::kRowMajor);
+ auto t_one = (rotated) ? n : m;
+ auto t_two = (rotated) ? m : n;
+ auto a_one = kl+ku+1;
+ auto a_two = (rotated) ? m : n;
+
+ // Checks for validity of the A matrix
+ auto status = StatusCode::kSuccess;
+ if (a_ld < a_one) { return StatusCode::kInvalidLeadDimA; }
+ try {
+ auto required_size = (a_ld*a_two + a_offset)*sizeof(T);
+ auto buffer_size = a_buffer.GetSize();
+ if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
+ } catch (...) { return StatusCode::kInvalidMatrixA; }
+
+ // Temporary buffer to generalize the input matrix
+ try {
+ auto t_buffer = Buffer<T>(context_, t_one*t_two);
+
+ // Creates a general matrix from the input to be able to run the regular Xgemv routine
+ try {
+ auto& program = GetProgramFromCache();
+ auto kernel = Kernel(program, "GeneralBandedToGeneral");
+
+ // Sets the arguments for the matrix transform kernel
+ kernel.SetArgument(0, static_cast<int>(a_one));
+ kernel.SetArgument(1, static_cast<int>(a_two));
+ kernel.SetArgument(2, static_cast<int>(a_ld));
+ kernel.SetArgument(3, static_cast<int>(a_offset));
+ kernel.SetArgument(4, a_buffer());
+ kernel.SetArgument(5, static_cast<int>(t_one));
+ kernel.SetArgument(6, static_cast<int>(t_two));
+ kernel.SetArgument(7, static_cast<int>(t_one));
+ kernel.SetArgument(8, static_cast<int>(0));
+ kernel.SetArgument(9, t_buffer());
+ kernel.SetArgument(10, static_cast<int>(layout));
+ if (rotated) {
+ kernel.SetArgument(11, static_cast<int>(ku));
+ kernel.SetArgument(12, static_cast<int>(kl));
+ }
+ else {
+ kernel.SetArgument(11, static_cast<int>(kl));
+ kernel.SetArgument(12, static_cast<int>(ku));
+ }
+
+ // Uses the common matrix-transforms thread configuration
+ auto global = std::vector<size_t>{Ceil(CeilDiv(t_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+ Ceil(CeilDiv(t_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+ auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+ status = RunKernel(kernel, global, local);
+ if (ErrorIn(status)) { return status; }
+
+ // Runs the regular Xgemv code
+ status = DoGemv(layout, a_transpose, m, n, alpha,
+ t_buffer, 0, t_one,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc);
+
+ // Return the status of the Xgemv routine
+ return status;
+ } catch (...) { return StatusCode::kInvalidKernel; }
+ } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgbmv<float>;
+template class Xgbmv<double>;
+template class Xgbmv<float2>;
+template class Xgbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index f95a9957..e52d2f20 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -32,8 +32,10 @@ template <typename T>
Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
source_string_ =
- #include "../../kernels/pad.opencl" // For {Herm,Symm}{Upper,Lower}ToSquared (for HEMV/SYMV)
- #include "../../kernels/xgemv.opencl"
+ #include "../../kernels/pad.opencl" // TODO: replace
+ #include "../../kernels/matrix_transforms/transforms.opencl"
+ #include "../../kernels/matrix_transforms/gbgemt.opencl"
+ #include "../../kernels/level2/xgemv.opencl"
;
}
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 3d6fe595..6a066518 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -34,7 +34,7 @@ class TuneXgemv {
static std::string GetSources() {
return
#include "../src/kernels/common.opencl"
- #include "../src/kernels/xgemv.opencl"
+ #include "../src/kernels/level2/xgemv.opencl"
;
}
diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h
index 9e1d110c..d4e6f24b 100644
--- a/test/correctness/testblas.h
+++ b/test/correctness/testblas.h
@@ -49,6 +49,7 @@ class TestBlas: public Tester<T,U> {
const std::vector<size_t> kIncrements = { 1, 2, 7 };
const std::vector<size_t> kMatrixDims = { 7, 64 };
const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
+ const std::vector<size_t> kBandSizes = { 4, 19 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
@@ -121,6 +122,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
auto ms = std::vector<size_t>{args.m};
auto ns = std::vector<size_t>{args.n};
auto ks = std::vector<size_t>{args.k};
+ auto kus = std::vector<size_t>{args.ku};
+ auto kls = std::vector<size_t>{args.kl};
auto layouts = std::vector<Layout>{args.layout};
auto a_transposes = std::vector<Transpose>{args.a_transpose};
auto b_transposes = std::vector<Transpose>{args.b_transpose};
@@ -156,6 +159,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
if (option == kArgM) { ms = dimensions; }
if (option == kArgN) { ns = dimensions; }
if (option == kArgK) { ks = dimensions; }
+ if (option == kArgKU) { kus = tester.kBandSizes; }
+ if (option == kArgKL) { kls = tester.kBandSizes; }
if (option == kArgLayout) { layouts = tester.kLayouts; }
if (option == kArgATransp) { a_transposes = C::GetATransposes(tester.kTransposes); }
if (option == kArgBTransp) { b_transposes = C::GetBTransposes(tester.kTransposes); }
@@ -197,21 +202,25 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
for (auto &m: ms) { r_args.m = m;
for (auto &n: ns) { r_args.n = n;
for (auto &k: ks) { r_args.k = k;
- for (auto &x_inc: x_incs) { r_args.x_inc = x_inc;
- for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset;
- for (auto &y_inc: y_incs) { r_args.y_inc = y_inc;
- for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset;
- for (auto &a_ld: a_lds) { r_args.a_ld = a_ld;
- for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset;
- for (auto &b_ld: b_lds) { r_args.b_ld = b_ld;
- for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset;
- for (auto &c_ld: c_lds) { r_args.c_ld = c_ld;
- for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset;
- for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset;
- for (auto &alpha: alphas) { r_args.alpha = alpha;
- for (auto &beta: betas) { r_args.beta = beta;
- C::SetSizes(r_args);
- regular_test_vector.push_back(r_args);
+ for (auto &ku: kus) { r_args.ku = ku;
+ for (auto &kl: kls) { r_args.kl = kl;
+ for (auto &x_inc: x_incs) { r_args.x_inc = x_inc;
+ for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset;
+ for (auto &y_inc: y_incs) { r_args.y_inc = y_inc;
+ for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset;
+ for (auto &a_ld: a_lds) { r_args.a_ld = a_ld;
+ for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset;
+ for (auto &b_ld: b_lds) { r_args.b_ld = b_ld;
+ for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset;
+ for (auto &c_ld: c_lds) { r_args.c_ld = c_ld;
+ for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset;
+ for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset;
+ for (auto &alpha: alphas) { r_args.alpha = alpha;
+ for (auto &beta: betas) { r_args.beta = beta;
+ C::SetSizes(r_args);
+ regular_test_vector.push_back(r_args);
+ }
+ }
}
}
}
@@ -232,7 +241,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<U>>{};
auto i_args = args;
- i_args.m = i_args.n = i_args.k = tester.kBufferSize;
+ i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
for (auto &x_size: x_sizes) { i_args.x_size = x_size;
for (auto &y_size: y_sizes) { i_args.y_size = y_size;
diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc
index f792925e..b1a58102 100644
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@@ -132,6 +132,8 @@ void Tester<T,U>::TestEnd() {
if (o == kArgM) { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); }
if (o == kArgN) { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); }
if (o == kArgK) { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); }
+ if (o == kArgKU) { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); }
+ if (o == kArgKL) { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); }
if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
diff --git a/test/performance/client.cc b/test/performance/client.cc
index 9faa4dca..a28ec5fd 100644
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@@ -48,9 +48,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
for (auto &o: options_) {
// Data-sizes
- if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); }
- if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
- if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
+ if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); }
+ if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
+ if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
+ if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, 128UL); }
+ if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, 128UL); }
// Data-layouts
if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@@ -225,8 +227,10 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas
auto integers = std::vector<size_t>{};
for (auto &o: options_) {
if (o == kArgM) { integers.push_back(args.m); }
- if (o == kArgN) { integers.push_back(args.n); }
+ else if (o == kArgN) { integers.push_back(args.n); }
else if (o == kArgK) { integers.push_back(args.k); }
+ else if (o == kArgKU) { integers.push_back(args.ku); }
+ else if (o == kArgKL) { integers.push_back(args.kl); }
else if (o == kArgLayout) { integers.push_back(static_cast<size_t>(args.layout)); }
else if (o == kArgSide) { integers.push_back(static_cast<size_t>(args.side)); }
else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h
new file mode 100644
index 00000000..0e238804
--- /dev/null
+++ b/test/routines/level2/xgbmv.h
@@ -0,0 +1,140 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xgbmv routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XGBMV_H_
+#define CLBLAST_TEST_ROUTINES_XGBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXgbmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgM, kArgN, kArgKL, kArgKU,
+ kArgLayout, kArgATransp,
+ kArgALeadDim, kArgXInc, kArgYInc,
+ kArgAOffset, kArgXOffset, kArgYOffset,
+ kArgAlpha, kArgBeta};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ auto a_transposed = (args.a_transpose != Transpose::kNo);
+ auto n_real = (a_transposed) ? args.m : args.n;
+ return n_real * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeY(const Arguments<T> &args) {
+ auto a_transposed = (args.a_transpose != Transpose::kNo);
+ auto m_real = (a_transposed) ? args.n : args.m;
+ return m_real * args.y_inc + args.y_offset;
+ }
+ static size_t GetSizeA(const Arguments<T> &args) {
+ auto a_rotated = (args.layout == Layout::kRowMajor);
+ auto a_two = (a_rotated) ? args.m : args.n;
+ return a_two * args.a_ld + args.a_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.a_size = GetSizeA(args);
+ args.x_size = GetSizeX(args);
+ args.y_size = GetSizeY(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &all) { return all; }
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Gbmv(args.layout, args.a_transpose,
+ args.m, args.n, args.kl, args.ku, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasTranspose>(args.a_transpose),
+ args.m, args.n, args.kl, args.ku, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.y_size, static_cast<T>(0));
+ buffers.y_vec.Read(queue, args.y_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ auto a_transposed = (args.a_transpose != Transpose::kNo);
+ return (a_transposed) ? args.n : args.m;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.y_inc + args.y_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.m * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ auto a_rotated = (args.layout == Layout::kRowMajor);
+ auto a_one = (a_rotated) ? args.n : args.m;
+ auto a_two = (a_rotated) ? args.m : args.n;
+ return ((args.kl+args.ku+1)*a_two + 2*a_one + a_two) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XGBMV_H_
+#endif