diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2015-09-26 17:02:34 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2015-09-26 17:02:34 +0200 |
commit | 92b4b0d1feaaf92e160fa0342daf4269f24fb4d2 (patch) | |
tree | 3356b67a281d8292e893028d74a1801554ce0ef2 | |
parent | 42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff) | |
parent | 2b56c2c60325f02bc695cbb968049cc09205c713 (diff) |
Merge pull request #27 from CNugteren/level2_matrix_vector
Added many level-2 matrix-vector routines
47 files changed, 2585 insertions, 296 deletions
@@ -1,3 +1,4 @@ build stash -.*
\ No newline at end of file +.* +*.pyc
\ No newline at end of file @@ -1,5 +1,6 @@ Development version (next release) +- Improved structure and performance of level-2 routines (xSYMV/xHEMV) - Added level-1 routines: * SSWAP/DSWAP/CSWAP/ZSWAP * SSCAL/DSCAL/CSCAL/ZSCAL @@ -7,6 +8,15 @@ Development version (next release) * SDOT/DDOT * CDOTU/ZDOTU * CDOTC/ZDOTC +- Added level-2 routines: + * SGBMV/DGBMV/CGBMV/ZGBMV + * CHBMV/ZHBMV + * CHPMV/ZHPMV + * SSBMV/DSBMV + * SSPMV/DSPMV + * STRMV/DTRMV/CTRMV/ZTRMV + * STBMV/DTBMV/CTBMV/ZTBMV + * STPMV/DTPMV/CTPMV/ZTPMV Version 0.4.0 - Now using the Claduc C++11 interface to OpenCL diff --git a/CMakeLists.txt b/CMakeLists.txt index 1960bf1d..1ddd2f77 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,7 +106,7 @@ set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm) set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_C sgemm) set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc) -set(LEVEL2_ROUTINES xgemv xhemv xsymv) +set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES}) set(PRECISIONS 32 3232 64 6464) @@ -6,7 +6,7 @@ CLBlast: The tuned OpenCL BLAS library CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. -__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support all routines yet: others will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details. +__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details. Why CLBlast and not clBLAS or cuBLAS? @@ -130,7 +130,7 @@ These graphs can be generated automatically on your own device. First, compile C Supported routines ------------- -CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables: +CLBlast is in active development but already supports the majority of BLAS routines. The currently supported routines are marked with '✔' in the following tables: | Level-1 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| @@ -153,16 +153,16 @@ CLBlast is in active development and currently does not support the full set of | Level-2 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| | xGEMV | ✔ | ✔ | ✔ | ✔ | | -| xGBMV | | | | | | +| xGBMV | ✔ | ✔ | ✔ | ✔ | | | xHEMV | - | - | ✔ | ✔ | | -| xHBMV | - | - | | | | -| xHPMV | - | - | | | | +| xHBMV | - | - | ✔ | ✔ | | +| xHPMV | - | - | ✔ | ✔ | | | xSYMV | ✔ | ✔ | - | - | | -| xSBMV | | | - | - | | -| xSPMV | | | - | - | | -| xTRMV | | | | | | -| xTBMV | | | | | | -| xTPMV | | | | | | +| xSBMV | ✔ | ✔ | - | - | | +| xSPMV | ✔ | ✔ | - | - | | +| xTRMV | ✔ | ✔ | ✔ | ✔ | | +| xTBMV | ✔ | ✔ | ✔ | ✔ | | +| xTPMV | ✔ | ✔ | ✔ | ✔ | | | xTRSV | | | | | | | xTBSV | | | | | | | xTPSV | | | | | | diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index 2c2cc797..df7a0d82 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -493,11 +493,11 @@ class Buffer { } // Copies the contents of this buffer into another device buffer - void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) { + void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const { CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0, nullptr, nullptr)); } - void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) { + void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const { CopyToAsync(queue, size, destination); queue.Finish(); } diff --git a/include/internal/routine.h b/include/internal/routine.h index c5b253b4..b7c06a97 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -72,6 +72,8 @@ class Routine { const size_t offset, const size_t ld, const size_t data_size); StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer, const size_t offset, const size_t ld, const size_t data_size); + StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, + const size_t offset, const size_t data_size); // Tests for valid inputs of vectors X and Y StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset, diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h new file mode 100644 index 00000000..27b033e9 --- /dev/null +++ b/include/internal/routines/level2/xgbmv.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xgbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XGBMV_H_ +#define CLBLAST_ROUTINES_XGBMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xgbmv: public Xgemv<T> { + public: + + // Uses the generic matrix-vector routine + using Xgemv<T>::MatVec; + + // Constructor + Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV"); + + // Templated-precision implementation of the routine + StatusCode DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XGBMV_H_ +#endif diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h index 1e120a5e..b31565ec 100644 --- a/include/internal/routines/level2/xgemv.h +++ b/include/internal/routines/level2/xgemv.h @@ -32,6 +32,7 @@ class Xgemv: public Routine<T> { using Routine<T>::TestVectorX; using Routine<T>::TestVectorY; using Routine<T>::TestMatrixA; + using Routine<T>::TestMatrixAP; using Routine<T>::RunKernel; using Routine<T>::ErrorIn; @@ -47,6 +48,18 @@ class Xgemv: public Routine<T> { const T beta, const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); + // Generic version used also for other matrix-vector multiplications + StatusCode MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, const bool packed, + const size_t kl, const size_t ku); + private: // Static variable to get the precision const static Precision precision_; diff --git a/include/internal/routines/level2/xhbmv.h b/include/internal/routines/level2/xhbmv.h new file mode 100644 index 00000000..65138424 --- /dev/null +++ b/include/internal/routines/level2/xhbmv.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHBMV_H_ +#define CLBLAST_ROUTINES_XHBMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xhbmv: public Xgemv<T> { + public: + + // Uses the generic matrix-vector routine + using Xgemv<T>::MatVec; + + // Constructor + Xhbmv(Queue &queue, Event &event, const std::string &name = "HBMV"); + + // Templated-precision implementation of the routine + StatusCode DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHBMV_H_ +#endif diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h index 801b2fc3..b74db760 100644 --- a/include/internal/routines/level2/xhemv.h +++ b/include/internal/routines/level2/xhemv.h @@ -7,8 +7,9 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file implements the Xhemv routine. It is based on the generalized matrix multiplication -// routine (Xgemv). The implementation is very similar to the Xsymv routine. +// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. // // ================================================================================================= @@ -25,16 +26,8 @@ template <typename T> class Xhemv: public Xgemv<T> { public: - // Members and methods from the base class - using Routine<T>::db_; - using Routine<T>::context_; - using Routine<T>::GetProgramFromCache; - using Routine<T>::TestMatrixA; - using Routine<T>::RunKernel; - using Routine<T>::ErrorIn; - - // Uses the regular Xgemv routine - using Xgemv<T>::DoGemv; + // Uses the generic matrix-vector routine + using Xgemv<T>::MatVec; // Constructor Xhemv(Queue &queue, Event &event, const std::string &name = "HEMV"); diff --git a/include/internal/routines/level2/xhpmv.h b/include/internal/routines/level2/xhpmv.h new file mode 100644 index 00000000..48f1ed3f --- /dev/null +++ b/include/internal/routines/level2/xhpmv.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XHPMV_H_ +#define CLBLAST_ROUTINES_XHPMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xhpmv: public Xgemv<T> { + public: + + // Uses the generic matrix-vector routine + using Xgemv<T>::MatVec; + + // Constructor + Xhpmv(Queue &queue, Event &event, const std::string &name = "HPMV"); + + // Templated-precision implementation of the routine + StatusCode DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XHPMV_H_ +#endif diff --git a/include/internal/routines/level2/xsbmv.h b/include/internal/routines/level2/xsbmv.h new file mode 100644 index 00000000..bb24d8f4 --- /dev/null +++ b/include/internal/routines/level2/xsbmv.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSBMV_H_ +#define CLBLAST_ROUTINES_XSBMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xsbmv: public Xgemv<T> { + public: + + // Uses the generic matrix-vector routine + using Xgemv<T>::MatVec; + + // Constructor + Xsbmv(Queue &queue, Event &event, const std::string &name = "SBMV"); + + // Templated-precision implementation of the routine + StatusCode DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSBMV_H_ +#endif diff --git a/include/internal/routines/level2/xspmv.h b/include/internal/routines/level2/xspmv.h new file mode 100644 index 00000000..88f02a2f --- /dev/null +++ b/include/internal/routines/level2/xspmv.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xspmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSPMV_H_ +#define CLBLAST_ROUTINES_XSPMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xspmv: public Xgemv<T> { + public: + + // Uses the generic matrix-vector routine + using Xgemv<T>::MatVec; + + // Constructor + Xspmv(Queue &queue, Event &event, const std::string &name = "SPMV"); + + // Templated-precision implementation of the routine + StatusCode DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSPMV_H_ +#endif diff --git a/include/internal/routines/level2/xsymv.h b/include/internal/routines/level2/xsymv.h index ab6da6d1..c7b92702 100644 --- a/include/internal/routines/level2/xsymv.h +++ b/include/internal/routines/level2/xsymv.h @@ -9,8 +9,7 @@ // // This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication // routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call the -// "DoGemm" function directly. The "DoSymv" function first preprocesses the symmetric matrix by -// transforming it into a general matrix, and then calls the regular GEMV code. +// "MatVec" function directly. // // ================================================================================================= @@ -27,16 +26,8 @@ template <typename T> class Xsymv: public Xgemv<T> { public: - // Members and methods from the base class - using Routine<T>::db_; - using Routine<T>::context_; - using Routine<T>::GetProgramFromCache; - using Routine<T>::TestMatrixA; - using Routine<T>::RunKernel; - using Routine<T>::ErrorIn; - - // Uses the regular Xgemv routine - using Xgemv<T>::DoGemv; + // Uses the generic matrix-vector routine + using Xgemv<T>::MatVec; // Constructor Xsymv(Queue &queue, Event &event, const std::string &name = "SYMV"); diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h new file mode 100644 index 00000000..89c90193 --- /dev/null +++ b/include/internal/routines/level2/xtbmv.h @@ -0,0 +1,51 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTBMV_H_ +#define CLBLAST_ROUTINES_XTBMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xtbmv: public Xgemv<T> { + public: + + // Members from the base class + using Routine<T>::queue_; + using Routine<T>::context_; + + // Uses the generic matrix-vector routine + using Xgemv<T>::MatVec; + + // Constructor + Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV"); + + // Templated-precision implementation of the routine + StatusCode DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTBMV_H_ +#endif diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h new file mode 100644 index 00000000..183d3505 --- /dev/null +++ b/include/internal/routines/level2/xtpmv.h @@ -0,0 +1,51 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTPMV_H_ +#define CLBLAST_ROUTINES_XTPMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xtpmv: public Xgemv<T> { + public: + + // Members from the base class + using Routine<T>::queue_; + using Routine<T>::context_; + + // Uses the generic matrix-vector routine + using Xgemv<T>::MatVec; + + // Constructor + Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV"); + + // Templated-precision implementation of the routine + StatusCode DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTPMV_H_ +#endif diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h new file mode 100644 index 00000000..dadfbc98 --- /dev/null +++ b/include/internal/routines/level2/xtrmv.h @@ -0,0 +1,51 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication +// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the +// "MatVec" function directly. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRMV_H_ +#define CLBLAST_ROUTINES_XTRMV_H_ + +#include "internal/routines/level2/xgemv.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xtrmv: public Xgemv<T> { + public: + + // Members from the base class + using Routine<T>::queue_; + using Routine<T>::context_; + + // Uses the generic matrix-vector routine + using Xgemv<T>::MatVec; + + // Constructor + Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV"); + + // Templated-precision implementation of the routine + StatusCode DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRMV_H_ +#endif diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 466ac810..bd174ccb 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -41,6 +41,8 @@ const std::string kKhronosDoublePrecision = "cl_khr_fp64"; constexpr auto kArgM = "m"; constexpr auto kArgN = "n"; constexpr auto kArgK = "k"; +constexpr auto kArgKL = "kl"; +constexpr auto kArgKU = "ku"; constexpr auto kArgLayout = "layout"; constexpr auto kArgATransp = "transA"; constexpr auto kArgBTransp = "transB"; @@ -57,6 +59,7 @@ constexpr auto kArgCLeadDim = "ldc"; constexpr auto kArgAOffset = "offa"; constexpr auto kArgBOffset = "offb"; constexpr auto kArgCOffset = "offc"; +constexpr auto kArgAPOffset = "offap"; constexpr auto kArgDotOffset = "offdot"; constexpr auto kArgAlpha = "alpha"; constexpr auto kArgBeta = "beta"; @@ -87,9 +90,11 @@ constexpr auto kArgNoAbbreviations = "no_abbrv"; template <typename T> struct Arguments { // Routine-specific arguments - size_t m = 0; - size_t n = 0; - size_t k = 0; + size_t m = 1; + size_t n = 1; + size_t k = 1; + size_t ku = 1; + size_t kl = 1; Layout layout = Layout::kRowMajor; Transpose a_transpose = Transpose::kNo; Transpose b_transpose = Transpose::kNo; @@ -100,12 +105,13 @@ struct Arguments { size_t y_inc = 1; size_t x_offset = 0; size_t y_offset = 0; - size_t a_ld = 0; - size_t b_ld = 0; - size_t c_ld = 0; + size_t a_ld = 1; + size_t b_ld = 1; + size_t c_ld = 1; size_t a_offset = 0; size_t b_offset = 0; size_t c_offset = 0; + size_t ap_offset = 0; size_t dot_offset = 0; T alpha = T{1.0}; T beta = T{1.0}; @@ -114,6 +120,7 @@ struct Arguments { size_t a_size = 1; size_t b_size = 1; size_t c_size = 1; + size_t ap_size = 1; size_t dot_size = 1; // Tuner-specific arguments double fraction = 1.0; @@ -141,6 +148,7 @@ struct Buffers { Buffer<T> a_mat; Buffer<T> b_mat; Buffer<T> c_mat; + Buffer<T> ap_mat; Buffer<T> dot; }; diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 677c8afc..25f02861 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -64,16 +64,16 @@ routines = [ ], [ # Level 2: matrix-vector Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"), - Routine(False, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"), + Routine(True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"), Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"), - Routine(False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"), - Routine(False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"), + Routine(True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"), + Routine(True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"), Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"), - Routine(False, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"), - Routine(False, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"), - Routine(False, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"), - Routine(False, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"), - Routine(False, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"), + Routine(True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"), + Routine(True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"), + Routine(True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"), + Routine(True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"), + Routine(True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"), Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"), Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"), Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"), @@ -213,7 +213,7 @@ def wrapper_clblas(routines): if routine.scratch: result += " auto queue = Queue(queues[0]);\n" result += " auto context = queue.GetContext();\n" - result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n);\n" + result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n*x_inc + x_offset);\n" arguments += ["scratch_buffer()"] result += " return clblas"+flavour.name+routine.name+"(" result += (",\n"+indent).join([a for a in arguments]) @@ -237,7 +237,7 @@ files = [ path_clblast+"/src/clblast_c.cc", path_clblast+"/test/wrapper_clblas.h", ] -header_lines = [84, 44, 80, 24, 22] +header_lines = [84, 52, 80, 24, 22] footer_lines = [6, 3, 5, 2, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/src/clblast.cc b/src/clblast.cc index a0dd8c70..77999aaf 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -28,8 +28,16 @@ // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" +#include "internal/routines/level2/xgbmv.h" #include "internal/routines/level2/xhemv.h" +#include "internal/routines/level2/xhbmv.h" +#include "internal/routines/level2/xhpmv.h" #include "internal/routines/level2/xsymv.h" +#include "internal/routines/level2/xsbmv.h" +#include "internal/routines/level2/xspmv.h" +#include "internal/routines/level2/xtrmv.h" +#include "internal/routines/level2/xtbmv.h" +#include "internal/routines/level2/xtpmv.h" // BLAS level-3 includes #include "internal/routines/level3/xgemm.h" @@ -327,15 +335,26 @@ template StatusCode Gemv<double2>(const Layout, const Transpose, // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV template <typename T> -StatusCode Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xgbmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoGbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Gbmv<float>(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, @@ -412,15 +431,26 @@ template StatusCode Hemv<double2>(const Layout, const Triangle, // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template <typename T> -StatusCode Hbmv(const Layout, const Triangle, - const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xhbmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHbmv(layout, triangle, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Hbmv<float2>(const Layout, const Triangle, const size_t, const size_t, @@ -441,15 +471,26 @@ template StatusCode Hbmv<double2>(const Layout, const Triangle, // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template <typename T> -StatusCode Hpmv(const Layout, const Triangle, - const size_t, - const T, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xhpmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHpmv(layout, triangle, + n, + alpha, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Hpmv<float2>(const Layout, const Triangle, const size_t, @@ -510,15 +551,26 @@ template StatusCode Symv<double>(const Layout, const Triangle, // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV template <typename T> -StatusCode Sbmv(const Layout, const Triangle, - const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xsbmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSbmv(layout, triangle, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Sbmv<float>(const Layout, const Triangle, const size_t, const size_t, @@ -539,15 +591,26 @@ template StatusCode Sbmv<double>(const Layout, const Triangle, // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV template <typename T> -StatusCode Spmv(const Layout, const Triangle, - const size_t, - const T, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xspmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSpmv(layout, triangle, + n, + alpha, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Spmv<float>(const Layout, const Triangle, const size_t, @@ -568,12 +631,20 @@ template StatusCode Spmv<double>(const Layout, const Triangle, // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV template <typename T> -StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xtrmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoTrmv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); } template StatusCode Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, @@ -598,12 +669,20 @@ template StatusCode Trmv<double2>(const Layout, const Triangle, const Transpose, // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV template <typename T> -StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xtbmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoTbmv(layout, triangle, a_transpose, diagonal, + n, k, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); } template StatusCode Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, @@ -628,12 +707,20 @@ template StatusCode Tbmv<double2>(const Layout, const Triangle, const Transpose, // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV template <typename T> -StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xtpmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoTpmv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc); } template StatusCode Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, diff --git a/src/kernels/xgemv.opencl b/src/kernels/level2/xgemv.opencl index 1e12dd78..8ed0e9e4 100644 --- a/src/kernels/xgemv.opencl +++ b/src/kernels/level2/xgemv.opencl @@ -79,22 +79,189 @@ R"( #endif // ================================================================================================= -// Defines how to load the input matrix in the regular case -// Loads a scalar input value +// Defines how to load the input matrix in the non-vectorized case inline real LoadMatrixA(const __global real* restrict agm, const int x, const int y, - const int a_ld, const int a_offset) { - return agm[x + a_ld*y + a_offset]; + const int a_ld, const int a_offset, const int parameter, + const int kl, const int ku) { + real result; + + // For banded matrices + #if defined(ROUTINE_GBMV) + const int k = ku - y; + if (x >= y-ku && x < y+kl+1) { result = agm[a_ld*y + k + x + a_offset]; } + else { SetToZero(result); } + + // For symmetric/hermitian matrices + #elif defined(ROUTINE_HEMV) || defined(ROUTINE_SYMV) + if ((parameter == 0 && y <= x) || (parameter == 1 && x <= y)) { + result = agm[a_ld*y + x + a_offset]; + #if defined(ROUTINE_HEMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + result = agm[a_ld*x + y + a_offset]; + #if defined(ROUTINE_HEMV) + COMPLEX_CONJUGATE(result); + #endif + } + + // For triangular matrices + #elif defined(ROUTINE_TRMV) + if (((parameter == 0 || parameter == 2) && y <= x) || + ((parameter == 1 || parameter == 3) && x <= y)) { + result = agm[a_ld*y + x + a_offset]; + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + + // For symmetric/hermitian banded matrices + #elif defined(ROUTINE_HBMV) || defined(ROUTINE_SBMV) + if (parameter == 1) { + if (x <= y) { + const int m = kl - y; + if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + const int m = kl - x; + if (y >= x-kl && y <= x) { result = agm[a_ld*x + m + y + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + else { + if (x >= y) { + const int m = -y; + if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + const int m = -x; + if (y >= x && y < x+kl+1) { result = agm[a_ld*x + m + y + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + + // For triangular banded matrices + #elif defined(ROUTINE_TBMV) + if (parameter == 1 || parameter == 3) { + if (x <= y) { + const int m = kl - y; + if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + else { + if (x >= y) { + const int m = -y; + if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + + // For symmetric/hermitian packed matrices + #elif defined(ROUTINE_HPMV) || defined(ROUTINE_SPMV) + if (parameter == 1) { + if (x <= y) { + result = agm[((y+1)*y)/2 + x + a_offset]; + #if defined(ROUTINE_HPMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + result = agm[((x+1)*x)/2 + y + a_offset]; + #if defined(ROUTINE_HPMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + else { + if (x >= y) { + result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset]; + #if defined(ROUTINE_HPMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + result = agm[((2*a_ld-(x+1))*x)/2 + y + a_offset]; + #if defined(ROUTINE_HPMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + + // For triangular packed matrices + #elif defined(ROUTINE_TPMV) + if (parameter == 1 || parameter == 3) { + if (x <= y) { + result = agm[((y+1)*y)/2 + x + a_offset]; + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + else { + if (x >= y) { + result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset]; + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + + // For general matrices + #else + result = agm[a_ld*y + x + a_offset]; + #endif + + return result; } + // Loads a vector input value (1/2) inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y, const int a_ld) { - return agm[x + a_ld*y]; + return agm[a_ld*y + x]; } + // Loads a vector input value (2/2): as before, but different data-type inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y, const int a_ld) { - return agm[x + a_ld*y]; + return agm[a_ld*y + x]; } // ================================================================================================= @@ -106,7 +273,8 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, const __global real* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate) { + const int do_conjugate, const int parameter, + const int kl, const int ku) { // Local memory for the vector X __local real xlm[WGS1]; @@ -141,20 +309,20 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, // The multiply-add function for the main part (divisable by WGS1) if (a_rotated == 0) { // Not rotated #pragma unroll - for (int kl=0; kl<WGS1; ++kl) { - const int k = kwg + kl; - real value = LoadMatrixA(agm, gid, k, a_ld, a_offset); + for (int kloop=0; kloop<WGS1; ++kloop) { + const int k = kwg + kloop; + real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku); if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } - MultiplyAdd(acc[w], xlm[kl], value); + MultiplyAdd(acc[w], xlm[kloop], value); } } else { // Transposed #pragma unroll - for (int kl=0; kl<WGS1; ++kl) { - const int k = kwg + kl; - real value = LoadMatrixA(agm, k, gid, a_ld, a_offset); + for (int kloop=0; kloop<WGS1; ++kloop) { + const int k = kwg + kloop; + real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku); if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } - MultiplyAdd(acc[w], xlm[kl], value); + MultiplyAdd(acc[w], xlm[kloop], value); } } } @@ -174,7 +342,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, if (a_rotated == 0) { // Not rotated #pragma unroll for (int k=n_floor; k<n; ++k) { - real value = LoadMatrixA(agm, gid, k, a_ld, a_offset); + real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku); if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value); } @@ -182,7 +350,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, else { // Transposed #pragma unroll for (int k=n_floor; k<n; ++k) { - real value = LoadMatrixA(agm, k, gid, a_ld, a_offset); + real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku); if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value); } @@ -209,7 +377,8 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b const __global realVF* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate) { + const int do_conjugate, const int parameter, + const int kl, const int ku) { // Local memory for the vector X __local real xlm[WGS2]; @@ -305,7 +474,8 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea const __global realVFR* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate) { + const int do_conjugate, const int parameter, + const int kl, const int ku) { // Local memory for the vector X __local real xlm[WGS3]; diff --git a/src/routine.cc b/src/routine.cc index 05a03683..2978c94a 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -191,6 +191,18 @@ StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buf return StatusCode::kSuccess; } +// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size +template <typename T> +StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer, + const size_t offset, const size_t data_size) { + try { + auto required_size = (((n*(n+1))/2) + offset)*data_size; + auto buffer_size = buffer.GetSize(); + if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; } + } catch (...) { return StatusCode::kInvalidMatrixA; } + return StatusCode::kSuccess; +} + // ================================================================================================= // Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc new file mode 100644 index 00000000..14d391ca --- /dev/null +++ b/src/routines/level2/xgbmv.cc @@ -0,0 +1,67 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xgbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xgbmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Reverses the upper and lower band count + auto rotated = (layout == Layout::kRowMajor); + auto kl_real = (rotated) ? ku : kl; + auto ku_real = (rotated) ? kl : ku; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_GBMV define. + bool fast_kernels = false; + return MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + 0, false, kl_real, ku_real); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgbmv<float>; +template class Xgbmv<double>; +template class Xgbmv<float2>; +template class Xgbmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index f95a9957..1b768dcd 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -32,8 +32,7 @@ template <typename T> Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name): Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) { source_string_ = - #include "../../kernels/pad.opencl" // For {Herm,Symm}{Upper,Lower}ToSquared (for HEMV/SYMV) - #include "../../kernels/xgemv.opencl" + #include "../../kernels/level2/xgemv.opencl" ; } @@ -49,6 +48,31 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, const T beta, const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + // Performs the matrix-vector multiplication + return MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + true, true, + 0, false, 0, 0); // N/A for this routine +} + +// ================================================================================================= + +// The generic implementation, also suited for other (non general) matrix-vector multiplications +template <typename T> +StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, const bool packed, + const size_t kl, const size_t ku) { + // Makes sure all dimensions are larger than zero if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } @@ -62,6 +86,11 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, auto m_real = (a_transposed) ? n : m; auto n_real = (a_transposed) ? m : n; + // Special adjustments for banded matrices + if (kl != 0 || ku != 0) { + a_one = kl+ku+1; + } + // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator) auto a_rotated = a_transposed ^ a_altlayout; @@ -69,7 +98,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, auto a_conjugate = (a_transpose == Transpose::kConjugate); // Tests the matrix and the vectors for validity - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + auto status = StatusCode::kSuccess; + if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); } + else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); } if (ErrorIn(status)) { return status; } status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T)); if (ErrorIn(status)) { return status; } @@ -77,26 +108,26 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, if (ErrorIn(status)) { return status; } // Determines whether or not the fast-version can be used - bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && - IsMultiple(n, db_["WGS2"]) && - IsMultiple(a_ld, db_["VW2"]); - bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && - IsMultiple(n, db_["WGS3"]) && - IsMultiple(a_ld, db_["VW3"]); + fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && + IsMultiple(n, db_["WGS2"]) && + IsMultiple(a_ld, db_["VW2"]); + fast_kernel_rot = fast_kernel_rot && (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && + IsMultiple(n, db_["WGS3"]) && + IsMultiple(a_ld, db_["VW3"]); // If possible, run the fast-version (rotated or non-rotated) of the kernel auto kernel_name = "Xgemv"; auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); auto global_size = m_ceiled / db_["WPT1"]; auto local_size = db_["WGS1"]; - if (use_fast_kernel) { + if (fast_kernel) { kernel_name = "XgemvFast"; global_size = m_real / db_["WPT2"]; local_size = db_["WGS2"]; } - if (use_fast_kernel_rot) { + if (fast_kernel_rot) { kernel_name = "XgemvFastRot"; global_size = m_real / db_["WPT3"]; local_size = db_["WGS3"]; @@ -123,6 +154,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, kernel.SetArgument(12, static_cast<int>(y_offset)); kernel.SetArgument(13, static_cast<int>(y_inc)); kernel.SetArgument(14, static_cast<int>(a_conjugate)); + kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm + kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices + kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices // Launches the kernel auto global = std::vector<size_t>{global_size}; diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc new file mode 100644 index 00000000..f59a7cb3 --- /dev/null +++ b/src/routines/level2/xhbmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xhbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xhbmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HBMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhbmv<float2>; +template class Xhbmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc index 2d92e45f..5a58b28b 100644 --- a/src/routines/level2/xhemv.cc +++ b/src/routines/level2/xhemv.cc @@ -37,57 +37,21 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle, const T beta, const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared"; - - // Temporary buffer for a copy of the hermitian matrix - try { - auto temp_herm = Buffer<T>(context_, n*n); - - // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemv - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the hermitian-to-squared kernel - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, static_cast<int>(a_ld)); - kernel.SetArgument(2, static_cast<int>(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast<int>(n)); - kernel.SetArgument(5, static_cast<int>(n)); - kernel.SetArgument(6, static_cast<int>(0)); - kernel.SetArgument(7, temp_herm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // hermitian-to-squared kernel uses the same parameters. - auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemv code - status = DoGemv(layout, Transpose::kNo, n, n, alpha, - temp_herm, 0, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc); - - // Return the status of the Xgemv routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HEMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc new file mode 100644 index 00000000..2269255d --- /dev/null +++ b/src/routines/level2/xhpmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xhpmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xhpmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xhpmv<T>::Xhpmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HPMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhpmv<float2>; +template class Xhpmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc new file mode 100644 index 00000000..457bd762 --- /dev/null +++ b/src/routines/level2/xsbmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xsbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xsbmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xsbmv<T>::Xsbmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SBMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsbmv<float>; +template class Xsbmv<double>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc new file mode 100644 index 00000000..4f1a9c61 --- /dev/null +++ b/src/routines/level2/xspmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xspmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xspmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xspmv<T>::Xspmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SPMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xspmv<float>; +template class Xspmv<double>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc index 2ccb51f6..ec12324b 100644 --- a/src/routines/level2/xsymv.cc +++ b/src/routines/level2/xsymv.cc @@ -37,57 +37,21 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle, const T beta, const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared"; - - // Temporary buffer for a copy of the symmetric matrix - try { - auto temp_symm = Buffer<T>(context_, n*n); - - // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemv - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the symmetric-to-squared kernel - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, static_cast<int>(a_ld)); - kernel.SetArgument(2, static_cast<int>(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast<int>(n)); - kernel.SetArgument(5, static_cast<int>(n)); - kernel.SetArgument(6, static_cast<int>(0)); - kernel.SetArgument(7, temp_symm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // symmetric-to-squared kernel uses the same parameters. - auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemv code - status = DoGemv(layout, Transpose::kNo, n, n, alpha, - temp_symm, 0, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc); - - // Return the status of the Xgemv routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SYMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc new file mode 100644 index 00000000..2e1aebff --- /dev/null +++ b/src/routines/level2/xtbmv.cc @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xtbmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtbmv<T>::Xtbmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TBMV define. + auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, k, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtbmv<float>; +template class Xtbmv<double>; +template class Xtbmv<float2>; +template class Xtbmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc new file mode 100644 index 00000000..aa0e099b --- /dev/null +++ b/src/routines/level2/xtpmv.cc @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtpmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xtpmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtpmv<T>::Xtpmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TPMV define. + auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + ap_buffer, ap_offset, n, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, true, 0, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtpmv<float>; +template class Xtpmv<double>; +template class Xtpmv<float2>; +template class Xtpmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc new file mode 100644 index 00000000..94424743 --- /dev/null +++ b/src/routines/level2/xtrmv.cc @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtrmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xtrmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtrmv<T>::Xtrmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TRMV define. + auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, 0, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrmv<float>; +template class Xtrmv<double>; +template class Xtrmv<float2>; +template class Xtrmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc index 3d6fe595..6a066518 100644 --- a/src/tuning/xgemv.cc +++ b/src/tuning/xgemv.cc @@ -34,7 +34,7 @@ class TuneXgemv { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/xgemv.opencl" + #include "../src/kernels/level2/xgemv.opencl" ; } diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index 839ac3b1..85e18381 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -57,12 +57,14 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent, a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); dot_source_.resize(std::max(max_mat, max_matvec) + max_offset); PopulateVector(x_source_); PopulateVector(y_source_); PopulateVector(a_source_); PopulateVector(b_source_); PopulateVector(c_source_); + PopulateVector(ap_source_); PopulateVector(dot_source_); } @@ -83,14 +85,16 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st auto a_mat1 = Buffer<T>(context_, args.a_size); auto b_mat1 = Buffer<T>(context_, args.b_size); auto c_mat1 = Buffer<T>(context_, args.c_size); + auto ap_mat1 = Buffer<T>(context_, args.ap_size); auto dot1 = Buffer<T>(context_, args.dot_size); x_vec1.Write(queue_, args.x_size, x_source_); y_vec1.Write(queue_, args.y_size, y_source_); a_mat1.Write(queue_, args.a_size, a_source_); b_mat1.Write(queue_, args.b_size, b_source_); c_mat1.Write(queue_, args.c_size, c_source_); + ap_mat1.Write(queue_, args.ap_size, ap_source_); dot1.Write(queue_, args.dot_size, dot_source_); - auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, dot1}; + auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, dot1}; auto status1 = run_reference_(args, buffers1, queue_); // Runs the CLBlast code @@ -99,14 +103,16 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st auto a_mat2 = Buffer<T>(context_, args.a_size); auto b_mat2 = Buffer<T>(context_, args.b_size); auto c_mat2 = Buffer<T>(context_, args.c_size); + auto ap_mat2 = Buffer<T>(context_, args.ap_size); auto dot2 = Buffer<T>(context_, args.dot_size); x_vec2.Write(queue_, args.x_size, x_source_); y_vec2.Write(queue_, args.y_size, y_source_); a_mat2.Write(queue_, args.a_size, a_source_); b_mat2.Write(queue_, args.b_size, b_source_); c_mat2.Write(queue_, args.c_size, c_source_); + ap_mat2.Write(queue_, args.ap_size, ap_source_); dot2.Write(queue_, args.dot_size, dot_source_); - auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, dot2}; + auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, dot2}; auto status2 = run_routine_(args, buffers2, queue_); // Tests for equality of the two status codes @@ -155,29 +161,33 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); + auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.dot_size*sizeof(T), nullptr,nullptr); auto x_vec1 = Buffer<T>(x1); auto y_vec1 = Buffer<T>(y1); auto a_mat1 = Buffer<T>(a1); auto b_mat1 = Buffer<T>(b1); auto c_mat1 = Buffer<T>(c1); + auto ap_mat1 = Buffer<T>(ap1); auto dot1 = Buffer<T>(d1); auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); + auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.dot_size*sizeof(T), nullptr,nullptr); auto x_vec2 = Buffer<T>(x2); auto y_vec2 = Buffer<T>(y2); auto a_mat2 = Buffer<T>(a2); auto b_mat2 = Buffer<T>(b2); auto c_mat2 = Buffer<T>(c2); + auto ap_mat2 = Buffer<T>(ap2); auto dot2 = Buffer<T>(d2); // Runs the two routines - auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, dot1}; - auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, dot2}; + auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, dot1}; + auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, dot2}; auto status1 = run_reference_(args, buffers1, queue_); auto status2 = run_routine_(args, buffers2, queue_); diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 9e1d110c..bfd1763c 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -49,6 +49,7 @@ class TestBlas: public Tester<T,U> { const std::vector<size_t> kIncrements = { 1, 2, 7 }; const std::vector<size_t> kMatrixDims = { 7, 64 }; const std::vector<size_t> kMatrixVectorDims = { 61, 512 }; + const std::vector<size_t> kBandSizes = { 4, 19 }; const std::vector<size_t> kOffsets = GetOffsets(); const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_); const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_); @@ -90,6 +91,7 @@ class TestBlas: public Tester<T,U> { std::vector<T> a_source_; std::vector<T> b_source_; std::vector<T> c_source_; + std::vector<T> ap_source_; std::vector<T> dot_source_; // The routine-specific functions passed to the tester @@ -121,6 +123,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name auto ms = std::vector<size_t>{args.m}; auto ns = std::vector<size_t>{args.n}; auto ks = std::vector<size_t>{args.k}; + auto kus = std::vector<size_t>{args.ku}; + auto kls = std::vector<size_t>{args.kl}; auto layouts = std::vector<Layout>{args.layout}; auto a_transposes = std::vector<Transpose>{args.a_transpose}; auto b_transposes = std::vector<Transpose>{args.b_transpose}; @@ -137,6 +141,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name auto a_offsets = std::vector<size_t>{args.a_offset}; auto b_offsets = std::vector<size_t>{args.b_offset}; auto c_offsets = std::vector<size_t>{args.c_offset}; + auto ap_offsets = std::vector<size_t>{args.ap_offset}; auto dot_offsets = std::vector<size_t>{args.dot_offset}; auto alphas = std::vector<U>{args.alpha}; auto betas = std::vector<U>{args.beta}; @@ -145,6 +150,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name auto a_sizes = std::vector<size_t>{args.a_size}; auto b_sizes = std::vector<size_t>{args.b_size}; auto c_sizes = std::vector<size_t>{args.c_size}; + auto ap_sizes = std::vector<size_t>{args.ap_size}; // Sets the dimensions of the matrices or vectors depending on the BLAS level auto dimensions = (C::BLASLevel() == 3) ? tester.kMatrixDims : @@ -156,6 +162,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name if (option == kArgM) { ms = dimensions; } if (option == kArgN) { ns = dimensions; } if (option == kArgK) { ks = dimensions; } + if (option == kArgKU) { kus = tester.kBandSizes; } + if (option == kArgKL) { kls = tester.kBandSizes; } if (option == kArgLayout) { layouts = tester.kLayouts; } if (option == kArgATransp) { a_transposes = C::GetATransposes(tester.kTransposes); } if (option == kArgBTransp) { b_transposes = C::GetBTransposes(tester.kTransposes); } @@ -172,6 +180,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name if (option == kArgAOffset) { a_offsets = tester.kOffsets; } if (option == kArgBOffset) { b_offsets = tester.kOffsets; } if (option == kArgCOffset) { c_offsets = tester.kOffsets; } + if (option == kArgAPOffset) { ap_offsets = tester.kOffsets; } if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; } @@ -181,6 +190,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name if (option == kArgAOffset) { a_sizes = tester.kMatSizes; } if (option == kArgBOffset) { b_sizes = tester.kMatSizes; } if (option == kArgCOffset) { c_sizes = tester.kMatSizes; } + if (option == kArgAPOffset) { ap_sizes = tester.kMatSizes; } } // Loops over the test-cases from a data-layout point of view @@ -197,21 +207,27 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name for (auto &m: ms) { r_args.m = m; for (auto &n: ns) { r_args.n = n; for (auto &k: ks) { r_args.k = k; - for (auto &x_inc: x_incs) { r_args.x_inc = x_inc; - for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset; - for (auto &y_inc: y_incs) { r_args.y_inc = y_inc; - for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset; - for (auto &a_ld: a_lds) { r_args.a_ld = a_ld; - for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset; - for (auto &b_ld: b_lds) { r_args.b_ld = b_ld; - for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset; - for (auto &c_ld: c_lds) { r_args.c_ld = c_ld; - for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset; - for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset; - for (auto &alpha: alphas) { r_args.alpha = alpha; - for (auto &beta: betas) { r_args.beta = beta; - C::SetSizes(r_args); - regular_test_vector.push_back(r_args); + for (auto &ku: kus) { r_args.ku = ku; + for (auto &kl: kls) { r_args.kl = kl; + for (auto &x_inc: x_incs) { r_args.x_inc = x_inc; + for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset; + for (auto &y_inc: y_incs) { r_args.y_inc = y_inc; + for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset; + for (auto &a_ld: a_lds) { r_args.a_ld = a_ld; + for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset; + for (auto &b_ld: b_lds) { r_args.b_ld = b_ld; + for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset; + for (auto &c_ld: c_lds) { r_args.c_ld = c_ld; + for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset; + for (auto &ap_offset: ap_offsets) { r_args.ap_offset = ap_offset; + for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset; + for (auto &alpha: alphas) { r_args.alpha = alpha; + for (auto &beta: betas) { r_args.beta = beta; + C::SetSizes(r_args); + regular_test_vector.push_back(r_args); + } + } + } } } } @@ -232,14 +248,16 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name // Creates the arguments vector for the invalid-buffer tests auto invalid_test_vector = std::vector<Arguments<U>>{}; auto i_args = args; - i_args.m = i_args.n = i_args.k = tester.kBufferSize; + i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize; i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize; for (auto &x_size: x_sizes) { i_args.x_size = x_size; for (auto &y_size: y_sizes) { i_args.y_size = y_size; for (auto &a_size: a_sizes) { i_args.a_size = a_size; for (auto &b_size: b_sizes) { i_args.b_size = b_size; for (auto &c_size: c_sizes) { i_args.c_size = c_size; - invalid_test_vector.push_back(i_args); + for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size; + invalid_test_vector.push_back(i_args); + } } } } diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index f792925e..350865f0 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -132,6 +132,8 @@ void Tester<T,U>::TestEnd() { if (o == kArgM) { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); } if (o == kArgN) { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); } if (o == kArgK) { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); } + if (o == kArgKU) { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); } + if (o == kArgKL) { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); } if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);} if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);} if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);} @@ -148,6 +150,7 @@ void Tester<T,U>::TestEnd() { if (o == kArgAOffset) { fprintf(stdout, "%s=%lu ", kArgAOffset, entry.args.a_offset);} if (o == kArgBOffset) { fprintf(stdout, "%s=%lu ", kArgBOffset, entry.args.b_offset);} if (o == kArgCOffset) { fprintf(stdout, "%s=%lu ", kArgCOffset, entry.args.c_offset);} + if (o == kArgAPOffset) { fprintf(stdout, "%s=%lu ", kArgAPOffset, entry.args.ap_offset);} if (o == kArgDotOffset){ fprintf(stdout, "%s=%lu ", kArgDotOffset, entry.args.dot_offset);} } fprintf(stdout, "\n"); diff --git a/test/performance/client.cc b/test/performance/client.cc index 9faa4dca..fb248854 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -48,9 +48,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric for (auto &o: options_) { // Data-sizes - if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); } - if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); } - if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); } + if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); } + if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); } + if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); } + if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, 128UL); } + if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, 128UL); } // Data-layouts if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); } @@ -73,6 +75,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); } if (o == kArgBOffset) { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); } if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); } + if (o == kArgAPOffset) { args.ap_offset= GetArgument(argc, argv, help, kArgAPOffset, size_t{0}); } // Dot arguments if (o == kArgDotOffset) { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); } @@ -131,12 +134,14 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) std::vector<T> a_source(args.a_size); std::vector<T> b_source(args.b_size); std::vector<T> c_source(args.c_size); + std::vector<T> ap_source(args.ap_size); std::vector<T> dot_source(args.dot_size); PopulateVector(x_source); PopulateVector(y_source); PopulateVector(a_source); PopulateVector(b_source); PopulateVector(c_source); + PopulateVector(ap_source); PopulateVector(dot_source); // Creates the matrices on the device @@ -145,14 +150,16 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) auto a_mat = Buffer<T>(context, args.a_size); auto b_mat = Buffer<T>(context, args.b_size); auto c_mat = Buffer<T>(context, args.c_size); + auto ap_mat = Buffer<T>(context, args.ap_size); auto dot = Buffer<T>(context, args.dot_size); x_vec.Write(queue, args.x_size, x_source); y_vec.Write(queue, args.y_size, y_source); a_mat.Write(queue, args.a_size, a_source); b_mat.Write(queue, args.b_size, b_source); c_mat.Write(queue, args.c_size, c_source); + ap_mat.Write(queue, args.ap_size, ap_source); dot.Write(queue, args.dot_size, dot_source); - auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, dot}; + auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, dot}; // Runs the routines and collects the timings auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast"); @@ -225,8 +232,10 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas auto integers = std::vector<size_t>{}; for (auto &o: options_) { if (o == kArgM) { integers.push_back(args.m); } - if (o == kArgN) { integers.push_back(args.n); } + else if (o == kArgN) { integers.push_back(args.n); } else if (o == kArgK) { integers.push_back(args.k); } + else if (o == kArgKU) { integers.push_back(args.ku); } + else if (o == kArgKL) { integers.push_back(args.kl); } else if (o == kArgLayout) { integers.push_back(static_cast<size_t>(args.layout)); } else if (o == kArgSide) { integers.push_back(static_cast<size_t>(args.side)); } else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); } @@ -243,6 +252,7 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas else if (o == kArgAOffset) { integers.push_back(args.a_offset); } else if (o == kArgBOffset) { integers.push_back(args.b_offset); } else if (o == kArgCOffset) { integers.push_back(args.c_offset); } + else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); } else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); } } auto strings = std::vector<std::string>{}; diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h new file mode 100644 index 00000000..0e238804 --- /dev/null +++ b/test/routines/level2/xgbmv.h @@ -0,0 +1,140 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xgbmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XGBMV_H_ +#define CLBLAST_TEST_ROUTINES_XGBMV_H_ + +#include <vector> +#include <string> + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXgbmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgM, kArgN, kArgKL, kArgKU, + kArgLayout, kArgATransp, + kArgALeadDim, kArgXInc, kArgYInc, + kArgAOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + auto n_real = (a_transposed) ? args.m : args.n; + return n_real * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments<T> &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + auto m_real = (a_transposed) ? args.n : args.m; + return m_real * args.y_inc + args.y_offset; + } + static size_t GetSizeA(const Arguments<T> &args) { + auto a_rotated = (args.layout == Layout::kRowMajor); + auto a_two = (a_rotated) ? args.m : args.n; + return a_two * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &args) { return args.n; } + static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gbmv(args.layout, args.a_transpose, + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasTranspose>(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.y_size, static_cast<T>(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { + auto a_transposed = (args.a_transpose != Transpose::kNo); + return (a_transposed) ? args.n : args.m; + } + static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return 2 * args.m * args.n; + } + static size_t GetBytes(const Arguments<T> &args) { + auto a_rotated = (args.layout == Layout::kRowMajor); + auto a_one = (a_rotated) ? args.n : args.m; + auto a_two = (a_rotated) ? args.m : args.n; + return ((args.kl+args.ku+1)*a_two + 2*a_one + a_two) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XGBMV_H_ +#endif diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h new file mode 100644 index 00000000..34e1502f --- /dev/null +++ b/test/routines/level2/xhbmv.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xhbmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XHBMV_H_ +#define CLBLAST_TEST_ROUTINES_XHBMV_H_ + +#include <vector> +#include <string> + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXhbmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgN, kArgKL, + kArgLayout, kArgTriangle, + kArgALeadDim, kArgXInc, kArgYInc, + kArgAOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments<T> &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeA(const Arguments<T> &args) { + return args.n * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &args) { return args.n; } + static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.y_size, static_cast<T>(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { + return args.n; + } + static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments<T> &args) { + return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XHBMV_H_ +#endif diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h new file mode 100644 index 00000000..8fd85b62 --- /dev/null +++ b/test/routines/level2/xhpmv.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xhpmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XHPMV_H_ +#define CLBLAST_TEST_ROUTINES_XHPMV_H_ + +#include <vector> +#include <string> + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXhpmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgN, + kArgLayout, kArgTriangle, + kArgXInc, kArgYInc, + kArgAPOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments<T> &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeAP(const Arguments<T> &args) { + return ((args.n*(args.n+1)) / 2) + args.ap_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.ap_size = GetSizeAP(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hpmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.y_size, static_cast<T>(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { + return args.n; + } + static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments<T> &args) { + return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XHPMV_H_ +#endif diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h new file mode 100644 index 00000000..5bc17e49 --- /dev/null +++ b/test/routines/level2/xsbmv.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xsbmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XSBMV_H_ +#define CLBLAST_TEST_ROUTINES_XSBMV_H_ + +#include <vector> +#include <string> + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXsbmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgN, kArgKL, + kArgLayout, kArgTriangle, + kArgALeadDim, kArgXInc, kArgYInc, + kArgAOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments<T> &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeA(const Arguments<T> &args) { + return args.n * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &args) { return args.n; } + static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Sbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.y_size, static_cast<T>(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { + return args.n; + } + static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments<T> &args) { + return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XSBMV_H_ +#endif diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h new file mode 100644 index 00000000..e335da42 --- /dev/null +++ b/test/routines/level2/xspmv.h @@ -0,0 +1,130 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xspmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XSPMV_H_ +#define CLBLAST_TEST_ROUTINES_XSPMV_H_ + +#include <vector> +#include <string> + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXspmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgN, + kArgLayout, kArgTriangle, + kArgXInc, kArgYInc, + kArgAPOffset, kArgXOffset, kArgYOffset, + kArgAlpha, kArgBeta}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeY(const Arguments<T> &args) { + return args.n * args.y_inc + args.y_offset; + } + static size_t GetSizeAP(const Arguments<T> &args) { + return ((args.n*(args.n+1)) / 2) + args.ap_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.ap_size = GetSizeAP(args); + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Spmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.y_size, static_cast<T>(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { + return args.n; + } + static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) { + return id1*args.y_inc + args.y_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments<T> &args) { + return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XSPMV_H_ +#endif diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h new file mode 100644 index 00000000..dbdddb65 --- /dev/null +++ b/test/routines/level2/xtbmv.h @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xtbmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTBMV_H_ +#define CLBLAST_TEST_ROUTINES_XTBMV_H_ + +#include <vector> +#include <string> + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXtbmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgN, kArgKL, + kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, + kArgALeadDim, kArgXInc, + kArgAOffset, kArgXOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeA(const Arguments<T> &args) { + return args.n * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &args) { return args.n; } + static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + static_cast<clblasDiag>(args.diagonal), + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.x_size, static_cast<T>(0)); + buffers.x_vec.Read(queue, args.x_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { + return args.n; + } + static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) { + return id1*args.x_inc + args.x_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments<T> &args) { + return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTBMV_H_ +#endif diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h new file mode 100644 index 00000000..4425765e --- /dev/null +++ b/test/routines/level2/xtpmv.h @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xtpmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTPMV_H_ +#define CLBLAST_TEST_ROUTINES_XTPMV_H_ + +#include <vector> +#include <string> + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXtpmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgN, + kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, + kArgXInc, + kArgAPOffset, kArgXOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeAP(const Arguments<T> &args) { + return ((args.n*(args.n+1)) / 2) + args.ap_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.ap_size = GetSizeAP(args); + args.x_size = GetSizeX(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &args) { return args.n; } + static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + static_cast<clblasDiag>(args.diagonal), + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.x_size, static_cast<T>(0)); + buffers.x_vec.Read(queue, args.x_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { + return args.n; + } + static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) { + return id1*args.x_inc + args.x_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments<T> &args) { + return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTPMV_H_ +#endif diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h new file mode 100644 index 00000000..1c0c6fd8 --- /dev/null +++ b/test/routines/level2/xtrmv.h @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xtrmv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTRMV_H_ +#define CLBLAST_TEST_ROUTINES_XTRMV_H_ + +#include <vector> +#include <string> + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXtrmv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgN, + kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, + kArgALeadDim, kArgXInc, + kArgAOffset, kArgXOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeA(const Arguments<T> &args) { + return args.n * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &args) { return args.n; } + static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + static_cast<clblasDiag>(args.diagonal), + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.x_size, static_cast<T>(0)); + buffers.x_vec.Read(queue, args.x_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { + return args.n; + } + static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) { + return id1*args.x_inc + args.x_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments<T> &args) { + return (args.n*args.n + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTRMV_H_ +#endif diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 10c7dd47..23a02a45 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -238,7 +238,7 @@ clblasStatus clblasXdot<float>(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<float>(context, n); + auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset); return clblasSdot(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast<int>(x_inc), @@ -255,7 +255,7 @@ clblasStatus clblasXdot<double>(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<double>(context, n); + auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset); return clblasDdot(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast<int>(x_inc), @@ -281,7 +281,7 @@ clblasStatus clblasXdotu<float2>(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<float2>(context, n); + auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset); return clblasCdotu(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast<int>(x_inc), @@ -298,7 +298,7 @@ clblasStatus clblasXdotu<double2>(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<double2>(context, n); + auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset); return clblasZdotu(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast<int>(x_inc), @@ -324,7 +324,7 @@ clblasStatus clblasXdotc<float2>(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<float2>(context, n); + auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset); return clblasCdotc(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast<int>(x_inc), @@ -341,7 +341,7 @@ clblasStatus clblasXdotc<double2>(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<double2>(context, n); + auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset); return clblasZdotc(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast<int>(x_inc), @@ -747,7 +747,7 @@ clblasStatus clblasXtrmv<float>(const clblasOrder layout, const clblasUplo trian cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<float>(context, n); + auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset); return clblasStrmv(layout, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, @@ -764,7 +764,7 @@ clblasStatus clblasXtrmv<double>(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<double>(context, n); + auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset); return clblasDtrmv(layout, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, @@ -781,7 +781,7 @@ clblasStatus clblasXtrmv<float2>(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<float2>(context, n); + auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset); return clblasCtrmv(layout, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, @@ -798,7 +798,7 @@ clblasStatus clblasXtrmv<double2>(const clblasOrder layout, const clblasUplo tri cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<double2>(context, n); + auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset); return clblasZtrmv(layout, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, @@ -824,7 +824,7 @@ clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo trian cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<float>(context, n); + auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset); return clblasStbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, @@ -841,7 +841,7 @@ clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<double>(context, n); + auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset); return clblasDtbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, @@ -858,7 +858,7 @@ clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<float2>(context, n); + auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset); return clblasCtbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, @@ -875,7 +875,7 @@ clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo tri cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<double2>(context, n); + auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset); return clblasZtbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, @@ -901,7 +901,7 @@ clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo trian cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<float>(context, n); + auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset); return clblasStpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, @@ -918,7 +918,7 @@ clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<double>(context, n); + auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset); return clblasDtpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, @@ -935,7 +935,7 @@ clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<float2>(context, n); + auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset); return clblasCtpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, @@ -952,7 +952,7 @@ clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo tri cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer<double2>(context, n); + auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset); return clblasZtpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, |