summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2015-09-26 17:02:34 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2015-09-26 17:02:34 +0200
commit92b4b0d1feaaf92e160fa0342daf4269f24fb4d2 (patch)
tree3356b67a281d8292e893028d74a1801554ce0ef2
parent42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff)
parent2b56c2c60325f02bc695cbb968049cc09205c713 (diff)
Merge pull request #27 from CNugteren/level2_matrix_vector
Added many level-2 matrix-vector routines
-rw-r--r--.gitignore3
-rw-r--r--CHANGELOG10
-rw-r--r--CMakeLists.txt2
-rw-r--r--README.md20
-rw-r--r--include/internal/clpp11.h4
-rw-r--r--include/internal/routine.h2
-rw-r--r--include/internal/routines/level2/xgbmv.h49
-rw-r--r--include/internal/routines/level2/xgemv.h13
-rw-r--r--include/internal/routines/level2/xhbmv.h49
-rw-r--r--include/internal/routines/level2/xhemv.h17
-rw-r--r--include/internal/routines/level2/xhpmv.h49
-rw-r--r--include/internal/routines/level2/xsbmv.h49
-rw-r--r--include/internal/routines/level2/xspmv.h49
-rw-r--r--include/internal/routines/level2/xsymv.h15
-rw-r--r--include/internal/routines/level2/xtbmv.h51
-rw-r--r--include/internal/routines/level2/xtpmv.h51
-rw-r--r--include/internal/routines/level2/xtrmv.h51
-rw-r--r--include/internal/utilities.h20
-rw-r--r--scripts/generator/generator.py20
-rw-r--r--src/clblast.cc213
-rw-r--r--src/kernels/level2/xgemv.opencl (renamed from src/kernels/xgemv.opencl)208
-rw-r--r--src/routine.cc12
-rw-r--r--src/routines/level2/xgbmv.cc67
-rw-r--r--src/routines/level2/xgemv.cc60
-rw-r--r--src/routines/level2/xhbmv.cc64
-rw-r--r--src/routines/level2/xhemv.cc66
-rw-r--r--src/routines/level2/xhpmv.cc64
-rw-r--r--src/routines/level2/xsbmv.cc64
-rw-r--r--src/routines/level2/xspmv.cc64
-rw-r--r--src/routines/level2/xsymv.cc66
-rw-r--r--src/routines/level2/xtbmv.cc81
-rw-r--r--src/routines/level2/xtpmv.cc81
-rw-r--r--src/routines/level2/xtrmv.cc81
-rw-r--r--src/tuning/xgemv.cc2
-rw-r--r--test/correctness/testblas.cc18
-rw-r--r--test/correctness/testblas.h52
-rw-r--r--test/correctness/tester.cc3
-rw-r--r--test/performance/client.cc20
-rw-r--r--test/routines/level2/xgbmv.h140
-rw-r--r--test/routines/level2/xhbmv.h130
-rw-r--r--test/routines/level2/xhpmv.h130
-rw-r--r--test/routines/level2/xsbmv.h130
-rw-r--r--test/routines/level2/xspmv.h130
-rw-r--r--test/routines/level2/xtbmv.h125
-rw-r--r--test/routines/level2/xtpmv.h125
-rw-r--r--test/routines/level2/xtrmv.h125
-rw-r--r--test/wrapper_clblas.h36
47 files changed, 2585 insertions, 296 deletions
diff --git a/.gitignore b/.gitignore
index 604b0a64..de7becef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
build
stash
-.* \ No newline at end of file
+.*
+*.pyc \ No newline at end of file
diff --git a/CHANGELOG b/CHANGELOG
index 5a91d171..c1981d7a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,6 @@
Development version (next release)
+- Improved structure and performance of level-2 routines (xSYMV/xHEMV)
- Added level-1 routines:
* SSWAP/DSWAP/CSWAP/ZSWAP
* SSCAL/DSCAL/CSCAL/ZSCAL
@@ -7,6 +8,15 @@ Development version (next release)
* SDOT/DDOT
* CDOTU/ZDOTU
* CDOTC/ZDOTC
+- Added level-2 routines:
+ * SGBMV/DGBMV/CGBMV/ZGBMV
+ * CHBMV/ZHBMV
+ * CHPMV/ZHPMV
+ * SSBMV/DSBMV
+ * SSPMV/DSPMV
+ * STRMV/DTRMV/CTRMV/ZTRMV
+ * STBMV/DTBMV/CTBMV/ZTBMV
+ * STPMV/DTPMV/CTPMV/ZTPMV
Version 0.4.0
- Now using the Claduc C++11 interface to OpenCL
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1960bf1d..1ddd2f77 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,7 +106,7 @@ set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sgemm)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
-set(LEVEL2_ROUTINES xgemv xhemv xsymv)
+set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
set(PRECISIONS 32 3232 64 6464)
diff --git a/README.md b/README.md
index 5ffc18d6..8c7870a2 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ CLBlast: The tuned OpenCL BLAS library
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
-__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support all routines yet: others will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
+__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
Why CLBlast and not clBLAS or cuBLAS?
@@ -130,7 +130,7 @@ These graphs can be generated automatically on your own device. First, compile C
Supported routines
-------------
-CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
+CLBlast is in active development but already supports the majority of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
| Level-1 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
@@ -153,16 +153,16 @@ CLBlast is in active development and currently does not support the full set of
| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
-| xGBMV | | | | | |
+| xGBMV | ✔ | ✔ | ✔ | ✔ | |
| xHEMV | - | - | ✔ | ✔ | |
-| xHBMV | - | - | | | |
-| xHPMV | - | - | | | |
+| xHBMV | - | - | ✔ | ✔ | |
+| xHPMV | - | - | ✔ | ✔ | |
| xSYMV | ✔ | ✔ | - | - | |
-| xSBMV | | | - | - | |
-| xSPMV | | | - | - | |
-| xTRMV | | | | | |
-| xTBMV | | | | | |
-| xTPMV | | | | | |
+| xSBMV | ✔ | ✔ | - | - | |
+| xSPMV | ✔ | ✔ | - | - | |
+| xTRMV | ✔ | ✔ | ✔ | ✔ | |
+| xTBMV | ✔ | ✔ | ✔ | ✔ | |
+| xTPMV | ✔ | ✔ | ✔ | ✔ | |
| xTRSV | | | | | |
| xTBSV | | | | | |
| xTPSV | | | | | |
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index 2c2cc797..df7a0d82 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -493,11 +493,11 @@ class Buffer {
}
// Copies the contents of this buffer into another device buffer
- void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) {
+ void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0,
nullptr, nullptr));
}
- void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) {
+ void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
CopyToAsync(queue, size, destination);
queue.Finish();
}
diff --git a/include/internal/routine.h b/include/internal/routine.h
index c5b253b4..b7c06a97 100644
--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@@ -72,6 +72,8 @@ class Routine {
const size_t offset, const size_t ld, const size_t data_size);
StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld, const size_t data_size);
+ StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer,
+ const size_t offset, const size_t data_size);
// Tests for valid inputs of vectors X and Y
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h
new file mode 100644
index 00000000..27b033e9
--- /dev/null
+++ b/include/internal/routines/level2/xgbmv.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGBMV_H_
+#define CLBLAST_ROUTINES_XGBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgbmv: public Xgemv<T> {
+ public:
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h
index 1e120a5e..b31565ec 100644
--- a/include/internal/routines/level2/xgemv.h
+++ b/include/internal/routines/level2/xgemv.h
@@ -32,6 +32,7 @@ class Xgemv: public Routine<T> {
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::TestMatrixA;
+ using Routine<T>::TestMatrixAP;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
@@ -47,6 +48,18 @@ class Xgemv: public Routine<T> {
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+ // Generic version used also for other matrix-vector multiplications
+ StatusCode MatVec(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ bool fast_kernel, bool fast_kernel_rot,
+ const size_t parameter, const bool packed,
+ const size_t kl, const size_t ku);
+
private:
// Static variable to get the precision
const static Precision precision_;
diff --git a/include/internal/routines/level2/xhbmv.h b/include/internal/routines/level2/xhbmv.h
new file mode 100644
index 00000000..65138424
--- /dev/null
+++ b/include/internal/routines/level2/xhbmv.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHBMV_H_
+#define CLBLAST_ROUTINES_XHBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xhbmv: public Xgemv<T> {
+ public:
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xhbmv(Queue &queue, Event &event, const std::string &name = "HBMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoHbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h
index 801b2fc3..b74db760 100644
--- a/include/internal/routines/level2/xhemv.h
+++ b/include/internal/routines/level2/xhemv.h
@@ -7,8 +7,9 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This file implements the Xhemv routine. It is based on the generalized matrix multiplication
-// routine (Xgemv). The implementation is very similar to the Xsymv routine.
+// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
//
// =================================================================================================
@@ -25,16 +26,8 @@ template <typename T>
class Xhemv: public Xgemv<T> {
public:
- // Members and methods from the base class
- using Routine<T>::db_;
- using Routine<T>::context_;
- using Routine<T>::GetProgramFromCache;
- using Routine<T>::TestMatrixA;
- using Routine<T>::RunKernel;
- using Routine<T>::ErrorIn;
-
- // Uses the regular Xgemv routine
- using Xgemv<T>::DoGemv;
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
// Constructor
Xhemv(Queue &queue, Event &event, const std::string &name = "HEMV");
diff --git a/include/internal/routines/level2/xhpmv.h b/include/internal/routines/level2/xhpmv.h
new file mode 100644
index 00000000..48f1ed3f
--- /dev/null
+++ b/include/internal/routines/level2/xhpmv.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPMV_H_
+#define CLBLAST_ROUTINES_XHPMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xhpmv: public Xgemv<T> {
+ public:
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xhpmv(Queue &queue, Event &event, const std::string &name = "HPMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoHpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPMV_H_
+#endif
diff --git a/include/internal/routines/level2/xsbmv.h b/include/internal/routines/level2/xsbmv.h
new file mode 100644
index 00000000..bb24d8f4
--- /dev/null
+++ b/include/internal/routines/level2/xsbmv.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSBMV_H_
+#define CLBLAST_ROUTINES_XSBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsbmv: public Xgemv<T> {
+ public:
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xsbmv(Queue &queue, Event &event, const std::string &name = "SBMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoSbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xspmv.h b/include/internal/routines/level2/xspmv.h
new file mode 100644
index 00000000..88f02a2f
--- /dev/null
+++ b/include/internal/routines/level2/xspmv.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xspmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPMV_H_
+#define CLBLAST_ROUTINES_XSPMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xspmv: public Xgemv<T> {
+ public:
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xspmv(Queue &queue, Event &event, const std::string &name = "SPMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoSpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPMV_H_
+#endif
diff --git a/include/internal/routines/level2/xsymv.h b/include/internal/routines/level2/xsymv.h
index ab6da6d1..c7b92702 100644
--- a/include/internal/routines/level2/xsymv.h
+++ b/include/internal/routines/level2/xsymv.h
@@ -9,8 +9,7 @@
//
// This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication
// routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call the
-// "DoGemm" function directly. The "DoSymv" function first preprocesses the symmetric matrix by
-// transforming it into a general matrix, and then calls the regular GEMV code.
+// "MatVec" function directly.
//
// =================================================================================================
@@ -27,16 +26,8 @@ template <typename T>
class Xsymv: public Xgemv<T> {
public:
- // Members and methods from the base class
- using Routine<T>::db_;
- using Routine<T>::context_;
- using Routine<T>::GetProgramFromCache;
- using Routine<T>::TestMatrixA;
- using Routine<T>::RunKernel;
- using Routine<T>::ErrorIn;
-
- // Uses the regular Xgemv routine
- using Xgemv<T>::DoGemv;
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
// Constructor
Xsymv(Queue &queue, Event &event, const std::string &name = "SYMV");
diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h
new file mode 100644
index 00000000..89c90193
--- /dev/null
+++ b/include/internal/routines/level2/xtbmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTBMV_H_
+#define CLBLAST_ROUTINES_XTBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtbmv: public Xgemv<T> {
+ public:
+
+ // Members from the base class
+ using Routine<T>::queue_;
+ using Routine<T>::context_;
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoTbmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h
new file mode 100644
index 00000000..183d3505
--- /dev/null
+++ b/include/internal/routines/level2/xtpmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTPMV_H_
+#define CLBLAST_ROUTINES_XTPMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtpmv: public Xgemv<T> {
+ public:
+
+ // Members from the base class
+ using Routine<T>::queue_;
+ using Routine<T>::context_;
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoTpmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTPMV_H_
+#endif
diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h
new file mode 100644
index 00000000..dadfbc98
--- /dev/null
+++ b/include/internal/routines/level2/xtrmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMV_H_
+#define CLBLAST_ROUTINES_XTRMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtrmv: public Xgemv<T> {
+ public:
+
+ // Members from the base class
+ using Routine<T>::queue_;
+ using Routine<T>::context_;
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoTrmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMV_H_
+#endif
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
index 466ac810..bd174ccb 100644
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@@ -41,6 +41,8 @@ const std::string kKhronosDoublePrecision = "cl_khr_fp64";
constexpr auto kArgM = "m";
constexpr auto kArgN = "n";
constexpr auto kArgK = "k";
+constexpr auto kArgKL = "kl";
+constexpr auto kArgKU = "ku";
constexpr auto kArgLayout = "layout";
constexpr auto kArgATransp = "transA";
constexpr auto kArgBTransp = "transB";
@@ -57,6 +59,7 @@ constexpr auto kArgCLeadDim = "ldc";
constexpr auto kArgAOffset = "offa";
constexpr auto kArgBOffset = "offb";
constexpr auto kArgCOffset = "offc";
+constexpr auto kArgAPOffset = "offap";
constexpr auto kArgDotOffset = "offdot";
constexpr auto kArgAlpha = "alpha";
constexpr auto kArgBeta = "beta";
@@ -87,9 +90,11 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
template <typename T>
struct Arguments {
// Routine-specific arguments
- size_t m = 0;
- size_t n = 0;
- size_t k = 0;
+ size_t m = 1;
+ size_t n = 1;
+ size_t k = 1;
+ size_t ku = 1;
+ size_t kl = 1;
Layout layout = Layout::kRowMajor;
Transpose a_transpose = Transpose::kNo;
Transpose b_transpose = Transpose::kNo;
@@ -100,12 +105,13 @@ struct Arguments {
size_t y_inc = 1;
size_t x_offset = 0;
size_t y_offset = 0;
- size_t a_ld = 0;
- size_t b_ld = 0;
- size_t c_ld = 0;
+ size_t a_ld = 1;
+ size_t b_ld = 1;
+ size_t c_ld = 1;
size_t a_offset = 0;
size_t b_offset = 0;
size_t c_offset = 0;
+ size_t ap_offset = 0;
size_t dot_offset = 0;
T alpha = T{1.0};
T beta = T{1.0};
@@ -114,6 +120,7 @@ struct Arguments {
size_t a_size = 1;
size_t b_size = 1;
size_t c_size = 1;
+ size_t ap_size = 1;
size_t dot_size = 1;
// Tuner-specific arguments
double fraction = 1.0;
@@ -141,6 +148,7 @@ struct Buffers {
Buffer<T> a_mat;
Buffer<T> b_mat;
Buffer<T> c_mat;
+ Buffer<T> ap_mat;
Buffer<T> dot;
};
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 677c8afc..25f02861 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -64,16 +64,16 @@ routines = [
],
[ # Level 2: matrix-vector
Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"),
- Routine(False, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
+ Routine(True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"),
- Routine(False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"),
- Routine(False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"),
+ Routine(True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"),
+ Routine(True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"),
Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"),
- Routine(False, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"),
- Routine(False, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"),
- Routine(False, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
- Routine(False, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
- Routine(False, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
+ Routine(True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"),
+ Routine(True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"),
+ Routine(True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
+ Routine(True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
+ Routine(True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"),
Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
@@ -213,7 +213,7 @@ def wrapper_clblas(routines):
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
- result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n);\n"
+ result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n*x_inc + x_offset);\n"
arguments += ["scratch_buffer()"]
result += " return clblas"+flavour.name+routine.name+"("
result += (",\n"+indent).join([a for a in arguments])
@@ -237,7 +237,7 @@ files = [
path_clblast+"/src/clblast_c.cc",
path_clblast+"/test/wrapper_clblas.h",
]
-header_lines = [84, 44, 80, 24, 22]
+header_lines = [84, 52, 80, 24, 22]
footer_lines = [6, 3, 5, 2, 6]
# Checks whether the command-line arguments are valid; exists otherwise
diff --git a/src/clblast.cc b/src/clblast.cc
index a0dd8c70..77999aaf 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -28,8 +28,16 @@
// BLAS level-2 includes
#include "internal/routines/level2/xgemv.h"
+#include "internal/routines/level2/xgbmv.h"
#include "internal/routines/level2/xhemv.h"
+#include "internal/routines/level2/xhbmv.h"
+#include "internal/routines/level2/xhpmv.h"
#include "internal/routines/level2/xsymv.h"
+#include "internal/routines/level2/xsbmv.h"
+#include "internal/routines/level2/xspmv.h"
+#include "internal/routines/level2/xtrmv.h"
+#include "internal/routines/level2/xtbmv.h"
+#include "internal/routines/level2/xtpmv.h"
// BLAS level-3 includes
#include "internal/routines/level3/xgemm.h"
@@ -327,15 +335,26 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
template <typename T>
-StatusCode Gbmv(const Layout, const Transpose,
- const size_t, const size_t, const size_t, const size_t,
- const T,
- const cl_mem, const size_t, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xgbmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoGbmv(layout, a_transpose,
+ m, n, kl, ku,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Gbmv<float>(const Layout, const Transpose,
const size_t, const size_t, const size_t, const size_t,
@@ -412,15 +431,26 @@ template StatusCode Hemv<double2>(const Layout, const Triangle,
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
template <typename T>
-StatusCode Hbmv(const Layout, const Triangle,
- const size_t, const size_t,
- const T,
- const cl_mem, const size_t, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Hbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xhbmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoHbmv(layout, triangle,
+ n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Hbmv<float2>(const Layout, const Triangle,
const size_t, const size_t,
@@ -441,15 +471,26 @@ template StatusCode Hbmv<double2>(const Layout, const Triangle,
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
template <typename T>
-StatusCode Hpmv(const Layout, const Triangle,
- const size_t,
- const T,
- const cl_mem, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Hpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xhpmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoHpmv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(ap_buffer), ap_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Hpmv<float2>(const Layout, const Triangle,
const size_t,
@@ -510,15 +551,26 @@ template StatusCode Symv<double>(const Layout, const Triangle,
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
template <typename T>
-StatusCode Sbmv(const Layout, const Triangle,
- const size_t, const size_t,
- const T,
- const cl_mem, const size_t, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Sbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xsbmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoSbmv(layout, triangle,
+ n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Sbmv<float>(const Layout, const Triangle,
const size_t, const size_t,
@@ -539,15 +591,26 @@ template StatusCode Sbmv<double>(const Layout, const Triangle,
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
template <typename T>
-StatusCode Spmv(const Layout, const Triangle,
- const size_t,
- const T,
- const cl_mem, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Spmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xspmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoSpmv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(ap_buffer), ap_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Spmv<float>(const Layout, const Triangle,
const size_t,
@@ -568,12 +631,20 @@ template StatusCode Spmv<double>(const Layout, const Triangle,
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
template <typename T>
-StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
- const size_t,
- const cl_mem, const size_t, const size_t,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xtrmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoTrmv(layout, triangle, a_transpose, diagonal,
+ n,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
@@ -598,12 +669,20 @@ template StatusCode Trmv<double2>(const Layout, const Triangle, const Transpose,
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
template <typename T>
-StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal,
- const size_t, const size_t,
- const cl_mem, const size_t, const size_t,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xtbmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoTbmv(layout, triangle, a_transpose, diagonal,
+ n, k,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
@@ -628,12 +707,20 @@ template StatusCode Tbmv<double2>(const Layout, const Triangle, const Transpose,
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
template <typename T>
-StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal,
- const size_t,
- const cl_mem, const size_t,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xtpmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoTpmv(layout, triangle, a_transpose, diagonal,
+ n,
+ Buffer<T>(ap_buffer), ap_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
diff --git a/src/kernels/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 1e12dd78..8ed0e9e4 100644
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -79,22 +79,189 @@ R"(
#endif
// =================================================================================================
-// Defines how to load the input matrix in the regular case
-// Loads a scalar input value
+// Defines how to load the input matrix in the non-vectorized case
inline real LoadMatrixA(const __global real* restrict agm, const int x, const int y,
- const int a_ld, const int a_offset) {
- return agm[x + a_ld*y + a_offset];
+ const int a_ld, const int a_offset, const int parameter,
+ const int kl, const int ku) {
+ real result;
+
+ // For banded matrices
+ #if defined(ROUTINE_GBMV)
+ const int k = ku - y;
+ if (x >= y-ku && x < y+kl+1) { result = agm[a_ld*y + k + x + a_offset]; }
+ else { SetToZero(result); }
+
+ // For symmetric/hermitian matrices
+ #elif defined(ROUTINE_HEMV) || defined(ROUTINE_SYMV)
+ if ((parameter == 0 && y <= x) || (parameter == 1 && x <= y)) {
+ result = agm[a_ld*y + x + a_offset];
+ #if defined(ROUTINE_HEMV)
+ if (x == y) { result.y = ZERO; }
+ #endif
+ }
+ else {
+ result = agm[a_ld*x + y + a_offset];
+ #if defined(ROUTINE_HEMV)
+ COMPLEX_CONJUGATE(result);
+ #endif
+ }
+
+ // For triangular matrices
+ #elif defined(ROUTINE_TRMV)
+ if (((parameter == 0 || parameter == 2) && y <= x) ||
+ ((parameter == 1 || parameter == 3) && x <= y)) {
+ result = agm[a_ld*y + x + a_offset];
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+
+ // For symmetric/hermitian banded matrices
+ #elif defined(ROUTINE_HBMV) || defined(ROUTINE_SBMV)
+ if (parameter == 1) {
+ if (x <= y) {
+ const int m = kl - y;
+ if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; }
+ else { SetToZero(result); }
+ #if defined(ROUTINE_HBMV)
+ if (x == y) { result.y = ZERO; }
+ #endif
+ }
+ else {
+ const int m = kl - x;
+ if (y >= x-kl && y <= x) { result = agm[a_ld*x + m + y + a_offset]; }
+ else { SetToZero(result); }
+ #if defined(ROUTINE_HBMV)
+ COMPLEX_CONJUGATE(result);
+ #endif
+ }
+ }
+ else {
+ if (x >= y) {
+ const int m = -y;
+ if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; }
+ else { SetToZero(result); }
+ #if defined(ROUTINE_HBMV)
+ if (x == y) { result.y = ZERO; }
+ #endif
+ }
+ else {
+ const int m = -x;
+ if (y >= x && y < x+kl+1) { result = agm[a_ld*x + m + y + a_offset]; }
+ else { SetToZero(result); }
+ #if defined(ROUTINE_HBMV)
+ COMPLEX_CONJUGATE(result);
+ #endif
+ }
+ }
+
+ // For triangular banded matrices
+ #elif defined(ROUTINE_TBMV)
+ if (parameter == 1 || parameter == 3) {
+ if (x <= y) {
+ const int m = kl - y;
+ if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; }
+ else { SetToZero(result); }
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+ else {
+ if (x >= y) {
+ const int m = -y;
+ if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; }
+ else { SetToZero(result); }
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+
+ // For symmetric/hermitian packed matrices
+ #elif defined(ROUTINE_HPMV) || defined(ROUTINE_SPMV)
+ if (parameter == 1) {
+ if (x <= y) {
+ result = agm[((y+1)*y)/2 + x + a_offset];
+ #if defined(ROUTINE_HPMV)
+ if (x == y) { result.y = ZERO; }
+ #endif
+ }
+ else {
+ result = agm[((x+1)*x)/2 + y + a_offset];
+ #if defined(ROUTINE_HPMV)
+ COMPLEX_CONJUGATE(result);
+ #endif
+ }
+ }
+ else {
+ if (x >= y) {
+ result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset];
+ #if defined(ROUTINE_HPMV)
+ if (x == y) { result.y = ZERO; }
+ #endif
+ }
+ else {
+ result = agm[((2*a_ld-(x+1))*x)/2 + y + a_offset];
+ #if defined(ROUTINE_HPMV)
+ COMPLEX_CONJUGATE(result);
+ #endif
+ }
+ }
+
+ // For triangular packed matrices
+ #elif defined(ROUTINE_TPMV)
+ if (parameter == 1 || parameter == 3) {
+ if (x <= y) {
+ result = agm[((y+1)*y)/2 + x + a_offset];
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+ else {
+ if (x >= y) {
+ result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset];
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+
+ // For general matrices
+ #else
+ result = agm[a_ld*y + x + a_offset];
+ #endif
+
+ return result;
}
+
// Loads a vector input value (1/2)
inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
const int a_ld) {
- return agm[x + a_ld*y];
+ return agm[a_ld*y + x];
}
+
// Loads a vector input value (2/2): as before, but different data-type
inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
const int a_ld) {
- return agm[x + a_ld*y];
+ return agm[a_ld*y + x];
}
// =================================================================================================
@@ -106,7 +273,8 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
const __global real* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
- const int do_conjugate) {
+ const int do_conjugate, const int parameter,
+ const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS1];
@@ -141,20 +309,20 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
// The multiply-add function for the main part (divisable by WGS1)
if (a_rotated == 0) { // Not rotated
#pragma unroll
- for (int kl=0; kl<WGS1; ++kl) {
- const int k = kwg + kl;
- real value = LoadMatrixA(agm, gid, k, a_ld, a_offset);
+ for (int kloop=0; kloop<WGS1; ++kloop) {
+ const int k = kwg + kloop;
+ real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
- MultiplyAdd(acc[w], xlm[kl], value);
+ MultiplyAdd(acc[w], xlm[kloop], value);
}
}
else { // Transposed
#pragma unroll
- for (int kl=0; kl<WGS1; ++kl) {
- const int k = kwg + kl;
- real value = LoadMatrixA(agm, k, gid, a_ld, a_offset);
+ for (int kloop=0; kloop<WGS1; ++kloop) {
+ const int k = kwg + kloop;
+ real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
- MultiplyAdd(acc[w], xlm[kl], value);
+ MultiplyAdd(acc[w], xlm[kloop], value);
}
}
}
@@ -174,7 +342,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
if (a_rotated == 0) { // Not rotated
#pragma unroll
for (int k=n_floor; k<n; ++k) {
- real value = LoadMatrixA(agm, gid, k, a_ld, a_offset);
+ real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value);
}
@@ -182,7 +350,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
else { // Transposed
#pragma unroll
for (int k=n_floor; k<n; ++k) {
- real value = LoadMatrixA(agm, k, gid, a_ld, a_offset);
+ real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value);
}
@@ -209,7 +377,8 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
const __global realVF* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
- const int do_conjugate) {
+ const int do_conjugate, const int parameter,
+ const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS2];
@@ -305,7 +474,8 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
- const int do_conjugate) {
+ const int do_conjugate, const int parameter,
+ const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS3];
diff --git a/src/routine.cc b/src/routine.cc
index 05a03683..2978c94a 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -191,6 +191,18 @@ StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buf
return StatusCode::kSuccess;
}
+// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size
+template <typename T>
+StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer,
+ const size_t offset, const size_t data_size) {
+ try {
+ auto required_size = (((n*(n+1))/2) + offset)*data_size;
+ auto buffer_size = buffer.GetSize();
+ if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
+ } catch (...) { return StatusCode::kInvalidMatrixA; }
+ return StatusCode::kSuccess;
+}
+
// =================================================================================================
// Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc
new file mode 100644
index 00000000..14d391ca
--- /dev/null
+++ b/src/routines/level2/xgbmv.cc
@@ -0,0 +1,67 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xgbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // Reverses the upper and lower band count
+ auto rotated = (layout == Layout::kRowMajor);
+ auto kl_real = (rotated) ? ku : kl;
+ auto ku_real = (rotated) ? kl : ku;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_GBMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, a_transpose,
+ m, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ 0, false, kl_real, ku_real);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgbmv<float>;
+template class Xgbmv<double>;
+template class Xgbmv<float2>;
+template class Xgbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index f95a9957..1b768dcd 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -32,8 +32,7 @@ template <typename T>
Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
source_string_ =
- #include "../../kernels/pad.opencl" // For {Herm,Symm}{Upper,Lower}ToSquared (for HEMV/SYMV)
- #include "../../kernels/xgemv.opencl"
+ #include "../../kernels/level2/xgemv.opencl"
;
}
@@ -49,6 +48,31 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+ // Performs the matrix-vector multiplication
+ return MatVec(layout, a_transpose,
+ m, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ true, true,
+ 0, false, 0, 0); // N/A for this routine
+}
+
+// =================================================================================================
+
+// The generic implementation, also suited for other (non general) matrix-vector multiplications
+template <typename T>
+StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ bool fast_kernel, bool fast_kernel_rot,
+ const size_t parameter, const bool packed,
+ const size_t kl, const size_t ku) {
+
// Makes sure all dimensions are larger than zero
if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
@@ -62,6 +86,11 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
auto m_real = (a_transposed) ? n : m;
auto n_real = (a_transposed) ? m : n;
+ // Special adjustments for banded matrices
+ if (kl != 0 || ku != 0) {
+ a_one = kl+ku+1;
+ }
+
// Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
auto a_rotated = a_transposed ^ a_altlayout;
@@ -69,7 +98,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
auto a_conjugate = (a_transpose == Transpose::kConjugate);
// Tests the matrix and the vectors for validity
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+ auto status = StatusCode::kSuccess;
+ if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
+ else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
@@ -77,26 +108,26 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
if (ErrorIn(status)) { return status; }
// Determines whether or not the fast-version can be used
- bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
- IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
- IsMultiple(n, db_["WGS2"]) &&
- IsMultiple(a_ld, db_["VW2"]);
- bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
- IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
- IsMultiple(n, db_["WGS3"]) &&
- IsMultiple(a_ld, db_["VW3"]);
+ fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
+ IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
+ IsMultiple(n, db_["WGS2"]) &&
+ IsMultiple(a_ld, db_["VW2"]);
+ fast_kernel_rot = fast_kernel_rot && (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
+ IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
+ IsMultiple(n, db_["WGS3"]) &&
+ IsMultiple(a_ld, db_["VW3"]);
// If possible, run the fast-version (rotated or non-rotated) of the kernel
auto kernel_name = "Xgemv";
auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
auto global_size = m_ceiled / db_["WPT1"];
auto local_size = db_["WGS1"];
- if (use_fast_kernel) {
+ if (fast_kernel) {
kernel_name = "XgemvFast";
global_size = m_real / db_["WPT2"];
local_size = db_["WGS2"];
}
- if (use_fast_kernel_rot) {
+ if (fast_kernel_rot) {
kernel_name = "XgemvFastRot";
global_size = m_real / db_["WPT3"];
local_size = db_["WGS3"];
@@ -123,6 +154,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
kernel.SetArgument(12, static_cast<int>(y_offset));
kernel.SetArgument(13, static_cast<int>(y_inc));
kernel.SetArgument(14, static_cast<int>(a_conjugate));
+ kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
+ kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
+ kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
// Launches the kernel
auto global = std::vector<size_t>{global_size};
diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc
new file mode 100644
index 00000000..f59a7cb3
--- /dev/null
+++ b/src/routines/level2/xhbmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xhbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_HBMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, k, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhbmv<float2>;
+template class Xhbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc
index 2d92e45f..5a58b28b 100644
--- a/src/routines/level2/xhemv.cc
+++ b/src/routines/level2/xhemv.cc
@@ -37,57 +37,21 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
- // Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
-
- // Checks for validity of the squared A matrix
- auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T));
- if (ErrorIn(status)) { return status; }
-
- // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as
- // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
- bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
- (triangle == Triangle::kLower && layout == Layout::kRowMajor));
- auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
-
- // Temporary buffer for a copy of the hermitian matrix
- try {
- auto temp_herm = Buffer<T>(context_, n*n);
-
- // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemv
- // routine afterwards
- try {
- auto& program = GetProgramFromCache();
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the arguments for the hermitian-to-squared kernel
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, static_cast<int>(a_ld));
- kernel.SetArgument(2, static_cast<int>(a_offset));
- kernel.SetArgument(3, a_buffer());
- kernel.SetArgument(4, static_cast<int>(n));
- kernel.SetArgument(5, static_cast<int>(n));
- kernel.SetArgument(6, static_cast<int>(0));
- kernel.SetArgument(7, temp_herm());
-
- // Uses the common padding kernel's thread configuration. This is allowed, since the
- // hermitian-to-squared kernel uses the same parameters.
- auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
- Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
- auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- status = RunKernel(kernel, global, local);
- if (ErrorIn(status)) { return status; }
-
- // Runs the regular Xgemv code
- status = DoGemv(layout, Transpose::kNo, n, n, alpha,
- temp_herm, 0, n,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc);
-
- // Return the status of the Xgemv routine
- return status;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_HEMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc
new file mode 100644
index 00000000..2269255d
--- /dev/null
+++ b/src/routines/level2/xhpmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xhpmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhpmv<T>::Xhpmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_HPMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ ap_buffer, ap_offset, n,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, true, 0, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhpmv<float2>;
+template class Xhpmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc
new file mode 100644
index 00000000..457bd762
--- /dev/null
+++ b/src/routines/level2/xsbmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xsbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xsbmv<T>::Xsbmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_SBMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, k, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xsbmv<float>;
+template class Xsbmv<double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc
new file mode 100644
index 00000000..4f1a9c61
--- /dev/null
+++ b/src/routines/level2/xspmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xspmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xspmv<T>::Xspmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_SPMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ ap_buffer, ap_offset, n,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, true, 0, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xspmv<float>;
+template class Xspmv<double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc
index 2ccb51f6..ec12324b 100644
--- a/src/routines/level2/xsymv.cc
+++ b/src/routines/level2/xsymv.cc
@@ -37,57 +37,21 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
- // Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
-
- // Checks for validity of the squared A matrix
- auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T));
- if (ErrorIn(status)) { return status; }
-
- // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as
- // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
- bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
- (triangle == Triangle::kLower && layout == Layout::kRowMajor));
- auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
-
- // Temporary buffer for a copy of the symmetric matrix
- try {
- auto temp_symm = Buffer<T>(context_, n*n);
-
- // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemv
- // routine afterwards
- try {
- auto& program = GetProgramFromCache();
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the arguments for the symmetric-to-squared kernel
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, static_cast<int>(a_ld));
- kernel.SetArgument(2, static_cast<int>(a_offset));
- kernel.SetArgument(3, a_buffer());
- kernel.SetArgument(4, static_cast<int>(n));
- kernel.SetArgument(5, static_cast<int>(n));
- kernel.SetArgument(6, static_cast<int>(0));
- kernel.SetArgument(7, temp_symm());
-
- // Uses the common padding kernel's thread configuration. This is allowed, since the
- // symmetric-to-squared kernel uses the same parameters.
- auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
- Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
- auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- status = RunKernel(kernel, global, local);
- if (ErrorIn(status)) { return status; }
-
- // Runs the regular Xgemv code
- status = DoGemv(layout, Transpose::kNo, n, n, alpha,
- temp_symm, 0, n,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc);
-
- // Return the status of the Xgemv routine
- return status;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific symmetric matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_SYMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc
new file mode 100644
index 00000000..2e1aebff
--- /dev/null
+++ b/src/routines/level2/xtbmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtbmv<T>::Xtbmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+ // Creates a copy of X: a temporary scratch buffer
+ auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+ try {
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+ } catch (...) { } // Continues: error-code is returned in MatVec
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Adds '2' to the parameter if the diagonal is unit
+ auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_TBMV define.
+ auto fast_kernels = false;
+ auto status = MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ a_buffer, a_offset, a_ld,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, false, k, 0);
+
+ // Returns the proper error code (renames vector Y to X)
+ switch(status) {
+ case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
+ case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
+ case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+ default: return status;
+ }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtbmv<float>;
+template class Xtbmv<double>;
+template class Xtbmv<float2>;
+template class Xtbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc
new file mode 100644
index 00000000..aa0e099b
--- /dev/null
+++ b/src/routines/level2/xtpmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtpmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtpmv<T>::Xtpmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+ // Creates a copy of X: a temporary scratch buffer
+ auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+ try {
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+ } catch (...) { } // Continues: error-code is returned in MatVec
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Adds '2' to the parameter if the diagonal is unit
+ auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_TPMV define.
+ auto fast_kernels = false;
+ auto status = MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ ap_buffer, ap_offset, n,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, true, 0, 0);
+
+ // Returns the proper error code (renames vector Y to X)
+ switch(status) {
+ case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
+ case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
+ case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+ default: return status;
+ }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtpmv<float>;
+template class Xtpmv<double>;
+template class Xtpmv<float2>;
+template class Xtpmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc
new file mode 100644
index 00000000..94424743
--- /dev/null
+++ b/src/routines/level2/xtrmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtrmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtrmv<T>::Xtrmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+ // Creates a copy of X: a temporary scratch buffer
+ auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+ try {
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+ } catch (...) { } // Continues: error-code is returned in MatVec
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Adds '2' to the parameter if the diagonal is unit
+ auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific triangular matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_TRMV define.
+ auto fast_kernels = false;
+ auto status = MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ a_buffer, a_offset, a_ld,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, false, 0, 0);
+
+ // Returns the proper error code (renames vector Y to X)
+ switch(status) {
+ case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
+ case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
+ case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+ default: return status;
+ }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtrmv<float>;
+template class Xtrmv<double>;
+template class Xtrmv<float2>;
+template class Xtrmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 3d6fe595..6a066518 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -34,7 +34,7 @@ class TuneXgemv {
static std::string GetSources() {
return
#include "../src/kernels/common.opencl"
- #include "../src/kernels/xgemv.opencl"
+ #include "../src/kernels/level2/xgemv.opencl"
;
}
diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc
index 839ac3b1..85e18381 100644
--- a/test/correctness/testblas.cc
+++ b/test/correctness/testblas.cc
@@ -57,12 +57,14 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+ ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
dot_source_.resize(std::max(max_mat, max_matvec) + max_offset);
PopulateVector(x_source_);
PopulateVector(y_source_);
PopulateVector(a_source_);
PopulateVector(b_source_);
PopulateVector(c_source_);
+ PopulateVector(ap_source_);
PopulateVector(dot_source_);
}
@@ -83,14 +85,16 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
auto a_mat1 = Buffer<T>(context_, args.a_size);
auto b_mat1 = Buffer<T>(context_, args.b_size);
auto c_mat1 = Buffer<T>(context_, args.c_size);
+ auto ap_mat1 = Buffer<T>(context_, args.ap_size);
auto dot1 = Buffer<T>(context_, args.dot_size);
x_vec1.Write(queue_, args.x_size, x_source_);
y_vec1.Write(queue_, args.y_size, y_source_);
a_mat1.Write(queue_, args.a_size, a_source_);
b_mat1.Write(queue_, args.b_size, b_source_);
c_mat1.Write(queue_, args.c_size, c_source_);
+ ap_mat1.Write(queue_, args.ap_size, ap_source_);
dot1.Write(queue_, args.dot_size, dot_source_);
- auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, dot1};
+ auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, dot1};
auto status1 = run_reference_(args, buffers1, queue_);
// Runs the CLBlast code
@@ -99,14 +103,16 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
auto a_mat2 = Buffer<T>(context_, args.a_size);
auto b_mat2 = Buffer<T>(context_, args.b_size);
auto c_mat2 = Buffer<T>(context_, args.c_size);
+ auto ap_mat2 = Buffer<T>(context_, args.ap_size);
auto dot2 = Buffer<T>(context_, args.dot_size);
x_vec2.Write(queue_, args.x_size, x_source_);
y_vec2.Write(queue_, args.y_size, y_source_);
a_mat2.Write(queue_, args.a_size, a_source_);
b_mat2.Write(queue_, args.b_size, b_source_);
c_mat2.Write(queue_, args.c_size, c_source_);
+ ap_mat2.Write(queue_, args.ap_size, ap_source_);
dot2.Write(queue_, args.dot_size, dot_source_);
- auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, dot2};
+ auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, dot2};
auto status2 = run_routine_(args, buffers2, queue_);
// Tests for equality of the two status codes
@@ -155,29 +161,33 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
+ auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.dot_size*sizeof(T), nullptr,nullptr);
auto x_vec1 = Buffer<T>(x1);
auto y_vec1 = Buffer<T>(y1);
auto a_mat1 = Buffer<T>(a1);
auto b_mat1 = Buffer<T>(b1);
auto c_mat1 = Buffer<T>(c1);
+ auto ap_mat1 = Buffer<T>(ap1);
auto dot1 = Buffer<T>(d1);
auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
+ auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.dot_size*sizeof(T), nullptr,nullptr);
auto x_vec2 = Buffer<T>(x2);
auto y_vec2 = Buffer<T>(y2);
auto a_mat2 = Buffer<T>(a2);
auto b_mat2 = Buffer<T>(b2);
auto c_mat2 = Buffer<T>(c2);
+ auto ap_mat2 = Buffer<T>(ap2);
auto dot2 = Buffer<T>(d2);
// Runs the two routines
- auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, dot1};
- auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, dot2};
+ auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, dot1};
+ auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, dot2};
auto status1 = run_reference_(args, buffers1, queue_);
auto status2 = run_routine_(args, buffers2, queue_);
diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h
index 9e1d110c..bfd1763c 100644
--- a/test/correctness/testblas.h
+++ b/test/correctness/testblas.h
@@ -49,6 +49,7 @@ class TestBlas: public Tester<T,U> {
const std::vector<size_t> kIncrements = { 1, 2, 7 };
const std::vector<size_t> kMatrixDims = { 7, 64 };
const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
+ const std::vector<size_t> kBandSizes = { 4, 19 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
@@ -90,6 +91,7 @@ class TestBlas: public Tester<T,U> {
std::vector<T> a_source_;
std::vector<T> b_source_;
std::vector<T> c_source_;
+ std::vector<T> ap_source_;
std::vector<T> dot_source_;
// The routine-specific functions passed to the tester
@@ -121,6 +123,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
auto ms = std::vector<size_t>{args.m};
auto ns = std::vector<size_t>{args.n};
auto ks = std::vector<size_t>{args.k};
+ auto kus = std::vector<size_t>{args.ku};
+ auto kls = std::vector<size_t>{args.kl};
auto layouts = std::vector<Layout>{args.layout};
auto a_transposes = std::vector<Transpose>{args.a_transpose};
auto b_transposes = std::vector<Transpose>{args.b_transpose};
@@ -137,6 +141,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
auto a_offsets = std::vector<size_t>{args.a_offset};
auto b_offsets = std::vector<size_t>{args.b_offset};
auto c_offsets = std::vector<size_t>{args.c_offset};
+ auto ap_offsets = std::vector<size_t>{args.ap_offset};
auto dot_offsets = std::vector<size_t>{args.dot_offset};
auto alphas = std::vector<U>{args.alpha};
auto betas = std::vector<U>{args.beta};
@@ -145,6 +150,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
auto a_sizes = std::vector<size_t>{args.a_size};
auto b_sizes = std::vector<size_t>{args.b_size};
auto c_sizes = std::vector<size_t>{args.c_size};
+ auto ap_sizes = std::vector<size_t>{args.ap_size};
// Sets the dimensions of the matrices or vectors depending on the BLAS level
auto dimensions = (C::BLASLevel() == 3) ? tester.kMatrixDims :
@@ -156,6 +162,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
if (option == kArgM) { ms = dimensions; }
if (option == kArgN) { ns = dimensions; }
if (option == kArgK) { ks = dimensions; }
+ if (option == kArgKU) { kus = tester.kBandSizes; }
+ if (option == kArgKL) { kls = tester.kBandSizes; }
if (option == kArgLayout) { layouts = tester.kLayouts; }
if (option == kArgATransp) { a_transposes = C::GetATransposes(tester.kTransposes); }
if (option == kArgBTransp) { b_transposes = C::GetBTransposes(tester.kTransposes); }
@@ -172,6 +180,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
if (option == kArgAOffset) { a_offsets = tester.kOffsets; }
if (option == kArgBOffset) { b_offsets = tester.kOffsets; }
if (option == kArgCOffset) { c_offsets = tester.kOffsets; }
+ if (option == kArgAPOffset) { ap_offsets = tester.kOffsets; }
if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; }
if (option == kArgAlpha) { alphas = tester.kAlphaValues; }
if (option == kArgBeta) { betas = tester.kBetaValues; }
@@ -181,6 +190,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
if (option == kArgAOffset) { a_sizes = tester.kMatSizes; }
if (option == kArgBOffset) { b_sizes = tester.kMatSizes; }
if (option == kArgCOffset) { c_sizes = tester.kMatSizes; }
+ if (option == kArgAPOffset) { ap_sizes = tester.kMatSizes; }
}
// Loops over the test-cases from a data-layout point of view
@@ -197,21 +207,27 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
for (auto &m: ms) { r_args.m = m;
for (auto &n: ns) { r_args.n = n;
for (auto &k: ks) { r_args.k = k;
- for (auto &x_inc: x_incs) { r_args.x_inc = x_inc;
- for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset;
- for (auto &y_inc: y_incs) { r_args.y_inc = y_inc;
- for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset;
- for (auto &a_ld: a_lds) { r_args.a_ld = a_ld;
- for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset;
- for (auto &b_ld: b_lds) { r_args.b_ld = b_ld;
- for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset;
- for (auto &c_ld: c_lds) { r_args.c_ld = c_ld;
- for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset;
- for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset;
- for (auto &alpha: alphas) { r_args.alpha = alpha;
- for (auto &beta: betas) { r_args.beta = beta;
- C::SetSizes(r_args);
- regular_test_vector.push_back(r_args);
+ for (auto &ku: kus) { r_args.ku = ku;
+ for (auto &kl: kls) { r_args.kl = kl;
+ for (auto &x_inc: x_incs) { r_args.x_inc = x_inc;
+ for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset;
+ for (auto &y_inc: y_incs) { r_args.y_inc = y_inc;
+ for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset;
+ for (auto &a_ld: a_lds) { r_args.a_ld = a_ld;
+ for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset;
+ for (auto &b_ld: b_lds) { r_args.b_ld = b_ld;
+ for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset;
+ for (auto &c_ld: c_lds) { r_args.c_ld = c_ld;
+ for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset;
+ for (auto &ap_offset: ap_offsets) { r_args.ap_offset = ap_offset;
+ for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset;
+ for (auto &alpha: alphas) { r_args.alpha = alpha;
+ for (auto &beta: betas) { r_args.beta = beta;
+ C::SetSizes(r_args);
+ regular_test_vector.push_back(r_args);
+ }
+ }
+ }
}
}
}
@@ -232,14 +248,16 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<U>>{};
auto i_args = args;
- i_args.m = i_args.n = i_args.k = tester.kBufferSize;
+ i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
for (auto &x_size: x_sizes) { i_args.x_size = x_size;
for (auto &y_size: y_sizes) { i_args.y_size = y_size;
for (auto &a_size: a_sizes) { i_args.a_size = a_size;
for (auto &b_size: b_sizes) { i_args.b_size = b_size;
for (auto &c_size: c_sizes) { i_args.c_size = c_size;
- invalid_test_vector.push_back(i_args);
+ for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size;
+ invalid_test_vector.push_back(i_args);
+ }
}
}
}
diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc
index f792925e..350865f0 100644
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@@ -132,6 +132,8 @@ void Tester<T,U>::TestEnd() {
if (o == kArgM) { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); }
if (o == kArgN) { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); }
if (o == kArgK) { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); }
+ if (o == kArgKU) { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); }
+ if (o == kArgKL) { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); }
if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
@@ -148,6 +150,7 @@ void Tester<T,U>::TestEnd() {
if (o == kArgAOffset) { fprintf(stdout, "%s=%lu ", kArgAOffset, entry.args.a_offset);}
if (o == kArgBOffset) { fprintf(stdout, "%s=%lu ", kArgBOffset, entry.args.b_offset);}
if (o == kArgCOffset) { fprintf(stdout, "%s=%lu ", kArgCOffset, entry.args.c_offset);}
+ if (o == kArgAPOffset) { fprintf(stdout, "%s=%lu ", kArgAPOffset, entry.args.ap_offset);}
if (o == kArgDotOffset){ fprintf(stdout, "%s=%lu ", kArgDotOffset, entry.args.dot_offset);}
}
fprintf(stdout, "\n");
diff --git a/test/performance/client.cc b/test/performance/client.cc
index 9faa4dca..fb248854 100644
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@@ -48,9 +48,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
for (auto &o: options_) {
// Data-sizes
- if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); }
- if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
- if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
+ if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); }
+ if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
+ if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
+ if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, 128UL); }
+ if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, 128UL); }
// Data-layouts
if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@@ -73,6 +75,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }
if (o == kArgBOffset) { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); }
if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
+ if (o == kArgAPOffset) { args.ap_offset= GetArgument(argc, argv, help, kArgAPOffset, size_t{0}); }
// Dot arguments
if (o == kArgDotOffset) { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); }
@@ -131,12 +134,14 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
std::vector<T> a_source(args.a_size);
std::vector<T> b_source(args.b_size);
std::vector<T> c_source(args.c_size);
+ std::vector<T> ap_source(args.ap_size);
std::vector<T> dot_source(args.dot_size);
PopulateVector(x_source);
PopulateVector(y_source);
PopulateVector(a_source);
PopulateVector(b_source);
PopulateVector(c_source);
+ PopulateVector(ap_source);
PopulateVector(dot_source);
// Creates the matrices on the device
@@ -145,14 +150,16 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto a_mat = Buffer<T>(context, args.a_size);
auto b_mat = Buffer<T>(context, args.b_size);
auto c_mat = Buffer<T>(context, args.c_size);
+ auto ap_mat = Buffer<T>(context, args.ap_size);
auto dot = Buffer<T>(context, args.dot_size);
x_vec.Write(queue, args.x_size, x_source);
y_vec.Write(queue, args.y_size, y_source);
a_mat.Write(queue, args.a_size, a_source);
b_mat.Write(queue, args.b_size, b_source);
c_mat.Write(queue, args.c_size, c_source);
+ ap_mat.Write(queue, args.ap_size, ap_source);
dot.Write(queue, args.dot_size, dot_source);
- auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, dot};
+ auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, dot};
// Runs the routines and collects the timings
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
@@ -225,8 +232,10 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas
auto integers = std::vector<size_t>{};
for (auto &o: options_) {
if (o == kArgM) { integers.push_back(args.m); }
- if (o == kArgN) { integers.push_back(args.n); }
+ else if (o == kArgN) { integers.push_back(args.n); }
else if (o == kArgK) { integers.push_back(args.k); }
+ else if (o == kArgKU) { integers.push_back(args.ku); }
+ else if (o == kArgKL) { integers.push_back(args.kl); }
else if (o == kArgLayout) { integers.push_back(static_cast<size_t>(args.layout)); }
else if (o == kArgSide) { integers.push_back(static_cast<size_t>(args.side)); }
else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
@@ -243,6 +252,7 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas
else if (o == kArgAOffset) { integers.push_back(args.a_offset); }
else if (o == kArgBOffset) { integers.push_back(args.b_offset); }
else if (o == kArgCOffset) { integers.push_back(args.c_offset); }
+ else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); }
else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); }
}
auto strings = std::vector<std::string>{};
diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h
new file mode 100644
index 00000000..0e238804
--- /dev/null
+++ b/test/routines/level2/xgbmv.h
@@ -0,0 +1,140 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xgbmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XGBMV_H_
+#define CLBLAST_TEST_ROUTINES_XGBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXgbmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgM, kArgN, kArgKL, kArgKU,
+ kArgLayout, kArgATransp,
+ kArgALeadDim, kArgXInc, kArgYInc,
+ kArgAOffset, kArgXOffset, kArgYOffset,
+ kArgAlpha, kArgBeta};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ auto a_transposed = (args.a_transpose != Transpose::kNo);
+ auto n_real = (a_transposed) ? args.m : args.n;
+ return n_real * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeY(const Arguments<T> &args) {
+ auto a_transposed = (args.a_transpose != Transpose::kNo);
+ auto m_real = (a_transposed) ? args.n : args.m;
+ return m_real * args.y_inc + args.y_offset;
+ }
+ static size_t GetSizeA(const Arguments<T> &args) {
+ auto a_rotated = (args.layout == Layout::kRowMajor);
+ auto a_two = (a_rotated) ? args.m : args.n;
+ return a_two * args.a_ld + args.a_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.a_size = GetSizeA(args);
+ args.x_size = GetSizeX(args);
+ args.y_size = GetSizeY(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &all) { return all; }
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Gbmv(args.layout, args.a_transpose,
+ args.m, args.n, args.kl, args.ku, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasTranspose>(args.a_transpose),
+ args.m, args.n, args.kl, args.ku, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.y_size, static_cast<T>(0));
+ buffers.y_vec.Read(queue, args.y_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ auto a_transposed = (args.a_transpose != Transpose::kNo);
+ return (a_transposed) ? args.n : args.m;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.y_inc + args.y_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.m * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ auto a_rotated = (args.layout == Layout::kRowMajor);
+ auto a_one = (a_rotated) ? args.n : args.m;
+ auto a_two = (a_rotated) ? args.m : args.n;
+ return ((args.kl+args.ku+1)*a_two + 2*a_one + a_two) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XGBMV_H_
+#endif
diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h
new file mode 100644
index 00000000..34e1502f
--- /dev/null
+++ b/test/routines/level2/xhbmv.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xhbmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XHBMV_H_
+#define CLBLAST_TEST_ROUTINES_XHBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXhbmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN, kArgKL,
+ kArgLayout, kArgTriangle,
+ kArgALeadDim, kArgXInc, kArgYInc,
+ kArgAOffset, kArgXOffset, kArgYOffset,
+ kArgAlpha, kArgBeta};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeY(const Arguments<T> &args) {
+ return args.n * args.y_inc + args.y_offset;
+ }
+ static size_t GetSizeA(const Arguments<T> &args) {
+ return args.n * args.a_ld + args.a_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.a_size = GetSizeA(args);
+ args.x_size = GetSizeX(args);
+ args.y_size = GetSizeY(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Hbmv(args.layout, args.triangle,
+ args.n, args.kl, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.kl, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.y_size, static_cast<T>(0));
+ buffers.y_vec.Read(queue, args.y_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ return args.n;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.y_inc + args.y_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XHBMV_H_
+#endif
diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h
new file mode 100644
index 00000000..8fd85b62
--- /dev/null
+++ b/test/routines/level2/xhpmv.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xhpmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XHPMV_H_
+#define CLBLAST_TEST_ROUTINES_XHPMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXhpmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN,
+ kArgLayout, kArgTriangle,
+ kArgXInc, kArgYInc,
+ kArgAPOffset, kArgXOffset, kArgYOffset,
+ kArgAlpha, kArgBeta};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeY(const Arguments<T> &args) {
+ return args.n * args.y_inc + args.y_offset;
+ }
+ static size_t GetSizeAP(const Arguments<T> &args) {
+ return ((args.n*(args.n+1)) / 2) + args.ap_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.ap_size = GetSizeAP(args);
+ args.x_size = GetSizeX(args);
+ args.y_size = GetSizeY(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Hpmv(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.y_size, static_cast<T>(0));
+ buffers.y_vec.Read(queue, args.y_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ return args.n;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.y_inc + args.y_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XHPMV_H_
+#endif
diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h
new file mode 100644
index 00000000..5bc17e49
--- /dev/null
+++ b/test/routines/level2/xsbmv.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsbmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSBMV_H_
+#define CLBLAST_TEST_ROUTINES_XSBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXsbmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN, kArgKL,
+ kArgLayout, kArgTriangle,
+ kArgALeadDim, kArgXInc, kArgYInc,
+ kArgAOffset, kArgXOffset, kArgYOffset,
+ kArgAlpha, kArgBeta};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeY(const Arguments<T> &args) {
+ return args.n * args.y_inc + args.y_offset;
+ }
+ static size_t GetSizeA(const Arguments<T> &args) {
+ return args.n * args.a_ld + args.a_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.a_size = GetSizeA(args);
+ args.x_size = GetSizeX(args);
+ args.y_size = GetSizeY(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Sbmv(args.layout, args.triangle,
+ args.n, args.kl, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.kl, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.y_size, static_cast<T>(0));
+ buffers.y_vec.Read(queue, args.y_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ return args.n;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.y_inc + args.y_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSBMV_H_
+#endif
diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h
new file mode 100644
index 00000000..e335da42
--- /dev/null
+++ b/test/routines/level2/xspmv.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xspmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSPMV_H_
+#define CLBLAST_TEST_ROUTINES_XSPMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXspmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN,
+ kArgLayout, kArgTriangle,
+ kArgXInc, kArgYInc,
+ kArgAPOffset, kArgXOffset, kArgYOffset,
+ kArgAlpha, kArgBeta};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeY(const Arguments<T> &args) {
+ return args.n * args.y_inc + args.y_offset;
+ }
+ static size_t GetSizeAP(const Arguments<T> &args) {
+ return ((args.n*(args.n+1)) / 2) + args.ap_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.ap_size = GetSizeAP(args);
+ args.x_size = GetSizeX(args);
+ args.y_size = GetSizeY(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Spmv(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXspmv(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.y_size, static_cast<T>(0));
+ buffers.y_vec.Read(queue, args.y_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ return args.n;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.y_inc + args.y_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSPMV_H_
+#endif
diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h
new file mode 100644
index 00000000..dbdddb65
--- /dev/null
+++ b/test/routines/level2/xtbmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtbmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTBMV_H_
+#define CLBLAST_TEST_ROUTINES_XTBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtbmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN, kArgKL,
+ kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+ kArgALeadDim, kArgXInc,
+ kArgAOffset, kArgXOffset};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeA(const Arguments<T> &args) {
+ return args.n * args.a_ld + args.a_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.a_size = GetSizeA(args);
+ args.x_size = GetSizeX(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &all) { return all; }
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n, args.kl,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n, args.kl,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.x_size, static_cast<T>(0));
+ buffers.x_vec.Read(queue, args.x_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ return args.n;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.x_inc + args.x_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTBMV_H_
+#endif
diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h
new file mode 100644
index 00000000..4425765e
--- /dev/null
+++ b/test/routines/level2/xtpmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtpmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTPMV_H_
+#define CLBLAST_TEST_ROUTINES_XTPMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtpmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN,
+ kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+ kArgXInc,
+ kArgAPOffset, kArgXOffset};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeAP(const Arguments<T> &args) {
+ return ((args.n*(args.n+1)) / 2) + args.ap_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.ap_size = GetSizeAP(args);
+ args.x_size = GetSizeX(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &all) { return all; }
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.x_size, static_cast<T>(0));
+ buffers.x_vec.Read(queue, args.x_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ return args.n;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.x_inc + args.x_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTPMV_H_
+#endif
diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h
new file mode 100644
index 00000000..1c0c6fd8
--- /dev/null
+++ b/test/routines/level2/xtrmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtrmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTRMV_H_
+#define CLBLAST_TEST_ROUTINES_XTRMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtrmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN,
+ kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+ kArgALeadDim, kArgXInc,
+ kArgAOffset, kArgXOffset};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeA(const Arguments<T> &args) {
+ return args.n * args.a_ld + args.a_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.a_size = GetSizeA(args);
+ args.x_size = GetSizeX(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &all) { return all; }
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.x_size, static_cast<T>(0));
+ buffers.x_vec.Read(queue, args.x_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ return args.n;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.x_inc + args.x_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return (args.n*args.n + 2*args.n + args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTRMV_H_
+#endif
diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h
index 10c7dd47..23a02a45 100644
--- a/test/wrapper_clblas.h
+++ b/test/wrapper_clblas.h
@@ -238,7 +238,7 @@ clblasStatus clblasXdot<float>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float>(context, n);
+ auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
return clblasSdot(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -255,7 +255,7 @@ clblasStatus clblasXdot<double>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double>(context, n);
+ auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
return clblasDdot(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -281,7 +281,7 @@ clblasStatus clblasXdotu<float2>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float2>(context, n);
+ auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
return clblasCdotu(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -298,7 +298,7 @@ clblasStatus clblasXdotu<double2>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double2>(context, n);
+ auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
return clblasZdotu(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -324,7 +324,7 @@ clblasStatus clblasXdotc<float2>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float2>(context, n);
+ auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
return clblasCdotc(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -341,7 +341,7 @@ clblasStatus clblasXdotc<double2>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double2>(context, n);
+ auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
return clblasZdotc(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -747,7 +747,7 @@ clblasStatus clblasXtrmv<float>(const clblasOrder layout, const clblasUplo trian
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float>(context, n);
+ auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
return clblasStrmv(layout, triangle, a_transpose, diagonal,
n,
a_buffer, a_offset, a_ld,
@@ -764,7 +764,7 @@ clblasStatus clblasXtrmv<double>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double>(context, n);
+ auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
return clblasDtrmv(layout, triangle, a_transpose, diagonal,
n,
a_buffer, a_offset, a_ld,
@@ -781,7 +781,7 @@ clblasStatus clblasXtrmv<float2>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float2>(context, n);
+ auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
return clblasCtrmv(layout, triangle, a_transpose, diagonal,
n,
a_buffer, a_offset, a_ld,
@@ -798,7 +798,7 @@ clblasStatus clblasXtrmv<double2>(const clblasOrder layout, const clblasUplo tri
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double2>(context, n);
+ auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
return clblasZtrmv(layout, triangle, a_transpose, diagonal,
n,
a_buffer, a_offset, a_ld,
@@ -824,7 +824,7 @@ clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo trian
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float>(context, n);
+ auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
return clblasStbmv(layout, triangle, a_transpose, diagonal,
n, k,
a_buffer, a_offset, a_ld,
@@ -841,7 +841,7 @@ clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double>(context, n);
+ auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
return clblasDtbmv(layout, triangle, a_transpose, diagonal,
n, k,
a_buffer, a_offset, a_ld,
@@ -858,7 +858,7 @@ clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float2>(context, n);
+ auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
return clblasCtbmv(layout, triangle, a_transpose, diagonal,
n, k,
a_buffer, a_offset, a_ld,
@@ -875,7 +875,7 @@ clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo tri
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double2>(context, n);
+ auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
return clblasZtbmv(layout, triangle, a_transpose, diagonal,
n, k,
a_buffer, a_offset, a_ld,
@@ -901,7 +901,7 @@ clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo trian
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float>(context, n);
+ auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
return clblasStpmv(layout, triangle, a_transpose, diagonal,
n,
ap_buffer, ap_offset,
@@ -918,7 +918,7 @@ clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double>(context, n);
+ auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
return clblasDtpmv(layout, triangle, a_transpose, diagonal,
n,
ap_buffer, ap_offset,
@@ -935,7 +935,7 @@ clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float2>(context, n);
+ auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
return clblasCtpmv(layout, triangle, a_transpose, diagonal,
n,
ap_buffer, ap_offset,
@@ -952,7 +952,7 @@ clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo tri
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double2>(context, n);
+ auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
return clblasZtpmv(layout, triangle, a_transpose, diagonal,
n,
ap_buffer, ap_offset,