diff options
author | CNugteren <web@cedricnugteren.nl> | 2015-09-18 15:25:20 +0200 |
---|---|---|
committer | CNugteren <web@cedricnugteren.nl> | 2015-09-18 15:25:20 +0200 |
commit | 4507ba4997cd546418eae0972c018073ac7b36aa (patch) | |
tree | 08e549a9e4f174a85eb7d9a8efd3735b1daae44a | |
parent | 42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff) |
Added first version of banded matrix-vector multiplication
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | CHANGELOG | 2 | ||||
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | include/internal/routines/level2/xgbmv.h | 58 | ||||
-rw-r--r-- | include/internal/routines/level2/xhemv.h | 4 | ||||
-rw-r--r-- | include/internal/utilities.h | 16 | ||||
-rw-r--r-- | scripts/generator/generator.py | 4 | ||||
-rw-r--r-- | src/clblast.cc | 30 | ||||
-rw-r--r-- | src/kernels/level2/xgemv.opencl (renamed from src/kernels/xgemv.opencl) | 0 | ||||
-rw-r--r-- | src/kernels/matrix_transforms/gbgemt.opencl | 60 | ||||
-rw-r--r-- | src/kernels/matrix_transforms/transforms.opencl | 40 | ||||
-rw-r--r-- | src/routines/level2/xgbmv.cc | 117 | ||||
-rw-r--r-- | src/routines/level2/xgemv.cc | 6 | ||||
-rw-r--r-- | src/tuning/xgemv.cc | 2 | ||||
-rw-r--r-- | test/correctness/testblas.h | 41 | ||||
-rw-r--r-- | test/correctness/tester.cc | 2 | ||||
-rw-r--r-- | test/performance/client.cc | 12 | ||||
-rw-r--r-- | test/routines/level2/xgbmv.h | 140 |
19 files changed, 497 insertions, 44 deletions
@@ -1,3 +1,4 @@ build stash -.*
\ No newline at end of file +.* +*.pyc
\ No newline at end of file @@ -7,6 +7,8 @@ Development version (next release) * SDOT/DDOT * CDOTU/ZDOTU * CDOTC/ZDOTC +- Added level-2 routines: + * SGBMV/DGBMV/CGBMV/ZGBMV Version 0.4.0 - Now using the Claduc C++11 interface to OpenCL diff --git a/CMakeLists.txt b/CMakeLists.txt index 1960bf1d..988eb683 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,7 +106,7 @@ set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm) set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_C sgemm) set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc) -set(LEVEL2_ROUTINES xgemv xhemv xsymv) +set(LEVEL2_ROUTINES xgemv xgbmv xhemv xsymv) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES}) set(PRECISIONS 32 3232 64 6464) @@ -153,7 +153,7 @@ CLBlast is in active development and currently does not support the full set of | Level-2 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| | xGEMV | ✔ | ✔ | ✔ | ✔ | | -| xGBMV | | | | | | +| xGBMV | ✔ | ✔ | ✔ | ✔ | | | xHEMV | - | - | ✔ | ✔ | | | xHBMV | - | - | | | | | xHPMV | - | - | | | | diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h new file mode 100644 index 00000000..763168d4 --- /dev/null +++ b/include/internal/routines/level2/xgbmv.h @@ -0,0 +1,58 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xgbmv routine. It is based on the generalized matrix multiplication +// routine (Xgemv). 
The Xgbmv class inherits from the templated class Xgemv, allowing it to call the +// "DoGemv" function directly. The "DoGbmv" function first preprocesses the banded matrix by +// transforming it into a general matrix, and then calls the regular GEMV code. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGBMV_H_ +#define CLBLAST_ROUTINES_XGBMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xgbmv: public Xgemv<T> { + public: + + // Members and methods from the base class + using Routine<T>::db_; + using Routine<T>::context_; + using Routine<T>::GetProgramFromCache; + using Routine<T>::TestMatrixA; + using Routine<T>::RunKernel; + using Routine<T>::ErrorIn; + + // Uses the regular Xgemv routine + using Xgemv<T>::DoGemv; + + // Constructor + Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV"); + + // Templated-precision implementation of the routine + StatusCode DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGBMV_H_ +#endif diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h index 801b2fc3..311ad9f8 100644 --- a/include/internal/routines/level2/xhemv.h +++ b/include/internal/routines/level2/xhemv.h @@ -8,7 +8,9 @@ // Cedric Nugteren 
<www.cedricnugteren.nl> // // This file implements the Xhemv routine. It is based on the generalized matrix multiplication -// routine (Xgemv). The implementation is very similar to the Xsymv routine. +// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the +// "DoGemv" function directly. The "DoHemv" function first preprocesses the Hermitian matrix by +// transforming it into a general matrix, and then calls the regular GEMV code. // // ================================================================================================= diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 466ac810..7ed84efb 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -41,6 +41,8 @@ const std::string kKhronosDoublePrecision = "cl_khr_fp64"; constexpr auto kArgM = "m"; constexpr auto kArgN = "n"; constexpr auto kArgK = "k"; +constexpr auto kArgKL = "kl"; +constexpr auto kArgKU = "ku"; constexpr auto kArgLayout = "layout"; constexpr auto kArgATransp = "transA"; constexpr auto kArgBTransp = "transB"; @@ -87,9 +89,11 @@ constexpr auto kArgNoAbbreviations = "no_abbrv"; template <typename T> struct Arguments { // Routine-specific arguments - size_t m = 0; - size_t n = 0; - size_t k = 0; + size_t m = 1; + size_t n = 1; + size_t k = 1; + size_t ku = 1; + size_t kl = 1; Layout layout = Layout::kRowMajor; Transpose a_transpose = Transpose::kNo; Transpose b_transpose = Transpose::kNo; @@ -100,9 +104,9 @@ struct Arguments { size_t y_inc = 1; size_t x_offset = 0; size_t y_offset = 0; - size_t a_ld = 0; - size_t b_ld = 0; - size_t c_ld = 0; + size_t a_ld = 1; + size_t b_ld = 1; + size_t c_ld = 1; size_t a_offset = 0; size_t b_offset = 0; size_t c_offset = 0; diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 677c8afc..d1171bc0 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -64,7 +64,7 @@ routines = [ ], [ # Level 2: 
matrix-vector Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"), - Routine(False, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"), + Routine(True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"), Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"), Routine(False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"), Routine(False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"), @@ -237,7 +237,7 @@ files = [ path_clblast+"/src/clblast_c.cc", path_clblast+"/test/wrapper_clblas.h", ] -header_lines = [84, 44, 80, 24, 22] +header_lines = [84, 45, 80, 24, 22] footer_lines = [6, 3, 5, 2, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/src/clblast.cc b/src/clblast.cc index a0dd8c70..ad5e354d 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -28,6 +28,7 @@ // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" +#include "internal/routines/level2/xgbmv.h" #include "internal/routines/level2/xhemv.h" #include "internal/routines/level2/xsymv.h" @@ -327,15 +328,26 @@ template StatusCode Gemv<double2>(const Layout, const Transpose, // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV template <typename T> -StatusCode Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - const 
cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xgbmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoGbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Gbmv<float>(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, diff --git a/src/kernels/xgemv.opencl b/src/kernels/level2/xgemv.opencl index 1e12dd78..1e12dd78 100644 --- a/src/kernels/xgemv.opencl +++ b/src/kernels/level2/xgemv.opencl diff --git a/src/kernels/matrix_transforms/gbgemt.opencl b/src/kernels/matrix_transforms/gbgemt.opencl new file mode 100644 index 00000000..e46e3a59 --- /dev/null +++ b/src/kernels/matrix_transforms/gbgemt.opencl @@ -0,0 +1,60 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the general banded (gb) to general (ge) matrix transforms. 
+// +// This kernel uses the matrix-transforms common tuning parameters. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_GBMV) + +// Kernel to transform a general banded matrix into a general matrix +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void GeneralBandedToGeneral(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const int layout, + const int kl, const int ku) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_two && id_one < dest_one) { + real result; + SetToZero(result); + const int k = ku - id_two + id_one; + if ((id_one >= id_two - ku) && (id_one < id_two + kl + 1)) { + result = src[id_two*src_ld + k + src_offset]; + } + dest[id_two*dest_ld + id_one + dest_offset] = result; + } + } + } +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/matrix_transforms/transforms.opencl b/src/kernels/matrix_transforms/transforms.opencl new file mode 
100644 index 00000000..01889a13 --- /dev/null +++ b/src/kernels/matrix_transforms/transforms.opencl @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the common functions and parameters specific for matrix-transform kernels. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. 
+#ifndef PAD_DIMX + #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef PAD_DIMY + #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef PAD_WPTX + #define PAD_WPTX 1 // Work per thread in the first dimension (x) +#endif +#ifndef PAD_WPTY + #define PAD_WPTY 1 // Work per thread in the second dimension (y) +#endif + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc new file mode 100644 index 00000000..eac208b3 --- /dev/null +++ b/src/routines/level2/xgbmv.cc @@ -0,0 +1,117 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xgbmv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level2/xgbmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0 || m == 0) { return StatusCode::kInvalidDimension; } + + // + auto rotated = (layout == Layout::kRowMajor); + auto t_one = (rotated) ? n : m; + auto t_two = (rotated) ? m : n; + auto a_one = kl+ku+1; + auto a_two = (rotated) ? m : n; + + // Checks for validity of the A matrix + auto status = StatusCode::kSuccess; + if (a_ld < a_one) { return StatusCode::kInvalidLeadDimA; } + try { + auto required_size = (a_ld*a_two + a_offset)*sizeof(T); + auto buffer_size = a_buffer.GetSize(); + if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; } + } catch (...) 
{ return StatusCode::kInvalidMatrixA; } + + // Temporary buffer to generalize the input matrix + try { + auto t_buffer = Buffer<T>(context_, t_one*t_two); + + // Creates a general matrix from the input to be able to run the regular Xgemv routine + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, "GeneralBandedToGeneral"); + + // Sets the arguments for the matrix transform kernel + kernel.SetArgument(0, static_cast<int>(a_one)); + kernel.SetArgument(1, static_cast<int>(a_two)); + kernel.SetArgument(2, static_cast<int>(a_ld)); + kernel.SetArgument(3, static_cast<int>(a_offset)); + kernel.SetArgument(4, a_buffer()); + kernel.SetArgument(5, static_cast<int>(t_one)); + kernel.SetArgument(6, static_cast<int>(t_two)); + kernel.SetArgument(7, static_cast<int>(t_one)); + kernel.SetArgument(8, static_cast<int>(0)); + kernel.SetArgument(9, t_buffer()); + kernel.SetArgument(10, static_cast<int>(layout)); + if (rotated) { + kernel.SetArgument(11, static_cast<int>(ku)); + kernel.SetArgument(12, static_cast<int>(kl)); + } + else { + kernel.SetArgument(11, static_cast<int>(kl)); + kernel.SetArgument(12, static_cast<int>(ku)); + } + + // Uses the common matrix-transforms thread configuration + auto global = std::vector<size_t>{Ceil(CeilDiv(t_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(t_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; + auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the regular Xgemv code + status = DoGemv(layout, a_transpose, m, n, alpha, + t_buffer, 0, t_one, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc); + + // Return the status of the Xgemv routine + return status; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgbmv<float>; +template class Xgbmv<double>; +template class Xgbmv<float2>; +template class Xgbmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index f95a9957..e52d2f20 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -32,8 +32,10 @@ template <typename T> Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name): Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) { source_string_ = - #include "../../kernels/pad.opencl" // For {Herm,Symm}{Upper,Lower}ToSquared (for HEMV/SYMV) - #include "../../kernels/xgemv.opencl" + #include "../../kernels/pad.opencl" // TODO: replace + #include "../../kernels/matrix_transforms/transforms.opencl" + #include "../../kernels/matrix_transforms/gbgemt.opencl" + #include "../../kernels/level2/xgemv.opencl" ; } diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc index 3d6fe595..6a066518 100644 --- a/src/tuning/xgemv.cc +++ b/src/tuning/xgemv.cc @@ -34,7 +34,7 @@ class TuneXgemv { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/xgemv.opencl" + #include "../src/kernels/level2/xgemv.opencl" ; } diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 9e1d110c..d4e6f24b 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -49,6 +49,7 @@ class TestBlas: public Tester<T,U> { const std::vector<size_t> kIncrements = { 1, 2, 7 }; const std::vector<size_t> kMatrixDims = { 7, 64 }; const std::vector<size_t> kMatrixVectorDims = { 61, 512 }; + const std::vector<size_t> kBandSizes = { 4, 19 }; const std::vector<size_t> kOffsets = 
GetOffsets(); const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_); const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_); @@ -121,6 +122,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name auto ms = std::vector<size_t>{args.m}; auto ns = std::vector<size_t>{args.n}; auto ks = std::vector<size_t>{args.k}; + auto kus = std::vector<size_t>{args.ku}; + auto kls = std::vector<size_t>{args.kl}; auto layouts = std::vector<Layout>{args.layout}; auto a_transposes = std::vector<Transpose>{args.a_transpose}; auto b_transposes = std::vector<Transpose>{args.b_transpose}; @@ -156,6 +159,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name if (option == kArgM) { ms = dimensions; } if (option == kArgN) { ns = dimensions; } if (option == kArgK) { ks = dimensions; } + if (option == kArgKU) { kus = tester.kBandSizes; } + if (option == kArgKL) { kls = tester.kBandSizes; } if (option == kArgLayout) { layouts = tester.kLayouts; } if (option == kArgATransp) { a_transposes = C::GetATransposes(tester.kTransposes); } if (option == kArgBTransp) { b_transposes = C::GetBTransposes(tester.kTransposes); } @@ -197,21 +202,25 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name for (auto &m: ms) { r_args.m = m; for (auto &n: ns) { r_args.n = n; for (auto &k: ks) { r_args.k = k; - for (auto &x_inc: x_incs) { r_args.x_inc = x_inc; - for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset; - for (auto &y_inc: y_incs) { r_args.y_inc = y_inc; - for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset; - for (auto &a_ld: a_lds) { r_args.a_ld = a_ld; - for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset; - for (auto &b_ld: b_lds) { r_args.b_ld = b_ld; - for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset; - for (auto &c_ld: c_lds) { r_args.c_ld = c_ld; - for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset; - for (auto &dot_offset: 
dot_offsets) { r_args.dot_offset = dot_offset; - for (auto &alpha: alphas) { r_args.alpha = alpha; - for (auto &beta: betas) { r_args.beta = beta; - C::SetSizes(r_args); - regular_test_vector.push_back(r_args); + for (auto &ku: kus) { r_args.ku = ku; + for (auto &kl: kls) { r_args.kl = kl; + for (auto &x_inc: x_incs) { r_args.x_inc = x_inc; + for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset; + for (auto &y_inc: y_incs) { r_args.y_inc = y_inc; + for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset; + for (auto &a_ld: a_lds) { r_args.a_ld = a_ld; + for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset; + for (auto &b_ld: b_lds) { r_args.b_ld = b_ld; + for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset; + for (auto &c_ld: c_lds) { r_args.c_ld = c_ld; + for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset; + for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset; + for (auto &alpha: alphas) { r_args.alpha = alpha; + for (auto &beta: betas) { r_args.beta = beta; + C::SetSizes(r_args); + regular_test_vector.push_back(r_args); + } + } } } } @@ -232,7 +241,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name // Creates the arguments vector for the invalid-buffer tests auto invalid_test_vector = std::vector<Arguments<U>>{}; auto i_args = args; - i_args.m = i_args.n = i_args.k = tester.kBufferSize; + i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize; i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize; for (auto &x_size: x_sizes) { i_args.x_size = x_size; for (auto &y_size: y_sizes) { i_args.y_size = y_size; diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index f792925e..b1a58102 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -132,6 +132,8 @@ void Tester<T,U>::TestEnd() { if (o == kArgM) { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); } if (o == kArgN) { fprintf(stdout, "%s=%lu ", kArgN, 
entry.args.n); } if (o == kArgK) { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); } + if (o == kArgKU) { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); } + if (o == kArgKL) { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); } if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);} if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);} if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);} diff --git a/test/performance/client.cc b/test/performance/client.cc index 9faa4dca..a28ec5fd 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -48,9 +48,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric for (auto &o: options_) { // Data-sizes - if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); } - if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); } - if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); } + if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); } + if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); } + if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); } + if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, 128UL); } + if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, 128UL); } // Data-layouts if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); } @@ -225,8 +227,10 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas auto integers = std::vector<size_t>{}; for (auto &o: options_) { if (o == kArgM) { integers.push_back(args.m); } - if (o == kArgN) { integers.push_back(args.n); } + else if (o == kArgN) { integers.push_back(args.n); } else if (o == kArgK) { integers.push_back(args.k); } + else if (o == kArgKU) { integers.push_back(args.ku); } + else if (o == kArgKL) { 
integers.push_back(args.kl); } else if (o == kArgLayout) { integers.push_back(static_cast<size_t>(args.layout)); } else if (o == kArgSide) { integers.push_back(static_cast<size_t>(args.side)); } else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); } diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h new file mode 100644 index 00000000..0e238804 --- /dev/null +++ b/test/routines/level2/xgbmv.h @@ -0,0 +1,140 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xgbmv routine. Examples of +// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XGBMV_H_ +#define CLBLAST_TEST_ROUTINES_XGBMV_H_ + +#include <vector> +#include <string> + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXgbmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgM, kArgN, kArgKL, kArgKU, + kArgLayout, kArgATransp, + kArgALeadDim, kArgXInc, kArgYInc, + kArgAOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + auto n_real = (a_transposed) ? args.m : args.n; + return n_real * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments<T> &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + auto m_real = (a_transposed) ? args.n : args.m; + return m_real * args.y_inc + args.y_offset; + } + static size_t GetSizeA(const Arguments<T> &args) { + auto a_rotated = (args.layout == Layout::kRowMajor); + auto a_two = (a_rotated) ? 
args.m : args.n; + return a_two * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &args) { return args.n; } + static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gbmv(args.layout, args.a_transpose, + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasTranspose>(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, 
nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.y_size, static_cast<T>(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + return (a_transposed) ? args.n : args.m; + } + static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return 2 * args.m * args.n; + } + static size_t GetBytes(const Arguments<T> &args) { + auto a_rotated = (args.layout == Layout::kRowMajor); + auto a_one = (a_rotated) ? args.n : args.m; + auto a_two = (a_rotated) ? args.m : args.n; + return ((args.kl+args.ku+1)*a_two + 2*a_one + a_two) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XGBMV_H_ +#endif |