summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG3
-rw-r--r--CMakeLists.txt2
-rw-r--r--README.md10
-rw-r--r--include/internal/routines/level2/xtbmv.h51
-rw-r--r--include/internal/routines/level2/xtpmv.h51
-rw-r--r--include/internal/routines/level2/xtrmv.h51
-rw-r--r--scripts/generator/generator.py10
-rw-r--r--src/clblast.cc63
-rw-r--r--src/kernels/level2/xgemv.opencl67
-rw-r--r--src/routines/level2/xtbmv.cc81
-rw-r--r--src/routines/level2/xtpmv.cc81
-rw-r--r--src/routines/level2/xtrmv.cc81
-rw-r--r--test/routines/level2/xtbmv.h125
-rw-r--r--test/routines/level2/xtpmv.h125
-rw-r--r--test/routines/level2/xtrmv.h125
-rw-r--r--test/wrapper_clblas.h36
16 files changed, 915 insertions, 47 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d4c9837c..c1981d7a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -14,6 +14,9 @@ Development version (next release)
* CHPMV/ZHPMV
* SSBMV/DSBMV
* SSPMV/DSPMV
+ * STRMV/DTRMV/CTRMV/ZTRMV
+ * STBMV/DTBMV/CTBMV/ZTBMV
+ * STPMV/DTPMV/CTPMV/ZTPMV
Version 0.4.0
- Now using the Claduc C++11 interface to OpenCL
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e670664..1ddd2f77 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,7 +106,7 @@ set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sgemm)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
-set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv)
+set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
set(PRECISIONS 32 3232 64 6464)
diff --git a/README.md b/README.md
index 7d62c92f..8c7870a2 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ CLBlast: The tuned OpenCL BLAS library
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
-__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support all routines yet: others will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
+__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
Why CLBlast and not clBLAS or cuBLAS?
@@ -130,7 +130,7 @@ These graphs can be generated automatically on your own device. First, compile C
Supported routines
-------------
-CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
+CLBlast is in active development but already supports the majority of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
| Level-1 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
@@ -160,9 +160,9 @@ CLBlast is in active development and currently does not support the full set of
| xSYMV | ✔ | ✔ | - | - | |
| xSBMV | ✔ | ✔ | - | - | |
| xSPMV | ✔ | ✔ | - | - | |
-| xTRMV | | | | | |
-| xTBMV | | | | | |
-| xTPMV | | | | | |
+| xTRMV | ✔ | ✔ | ✔ | ✔ | |
+| xTBMV | ✔ | ✔ | ✔ | ✔ | |
+| xTPMV | ✔ | ✔ | ✔ | ✔ | |
| xTRSV | | | | | |
| xTBSV | | | | | |
| xTPSV | | | | | |
diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h
new file mode 100644
index 00000000..89c90193
--- /dev/null
+++ b/include/internal/routines/level2/xtbmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTBMV_H_
+#define CLBLAST_ROUTINES_XTBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtbmv: public Xgemv<T> {
+ public:
+
+ // Members from the base class
+ using Routine<T>::queue_;
+ using Routine<T>::context_;
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoTbmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h
new file mode 100644
index 00000000..183d3505
--- /dev/null
+++ b/include/internal/routines/level2/xtpmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTPMV_H_
+#define CLBLAST_ROUTINES_XTPMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtpmv: public Xgemv<T> {
+ public:
+
+ // Members from the base class
+ using Routine<T>::queue_;
+ using Routine<T>::context_;
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoTpmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTPMV_H_
+#endif
diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h
new file mode 100644
index 00000000..dadfbc98
--- /dev/null
+++ b/include/internal/routines/level2/xtrmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMV_H_
+#define CLBLAST_ROUTINES_XTRMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtrmv: public Xgemv<T> {
+ public:
+
+ // Members from the base class
+ using Routine<T>::queue_;
+ using Routine<T>::context_;
+
+ // Uses the generic matrix-vector routine
+ using Xgemv<T>::MatVec;
+
+ // Constructor
+ Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoTrmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMV_H_
+#endif
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 338c8468..25f02861 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -71,9 +71,9 @@ routines = [
Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"),
Routine(True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"),
Routine(True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"),
- Routine(False, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
- Routine(False, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
- Routine(False, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
+ Routine(True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
+ Routine(True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
+ Routine(True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"),
Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
@@ -213,7 +213,7 @@ def wrapper_clblas(routines):
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
- result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n);\n"
+ result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n*x_inc + x_offset);\n"
arguments += ["scratch_buffer()"]
result += " return clblas"+flavour.name+routine.name+"("
result += (",\n"+indent).join([a for a in arguments])
@@ -237,7 +237,7 @@ files = [
path_clblast+"/src/clblast_c.cc",
path_clblast+"/test/wrapper_clblas.h",
]
-header_lines = [84, 49, 80, 24, 22]
+header_lines = [84, 52, 80, 24, 22]
footer_lines = [6, 3, 5, 2, 6]
# Checks whether the command-line arguments are valid; exists otherwise
diff --git a/src/clblast.cc b/src/clblast.cc
index f7baf5e8..77999aaf 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -35,6 +35,9 @@
#include "internal/routines/level2/xsymv.h"
#include "internal/routines/level2/xsbmv.h"
#include "internal/routines/level2/xspmv.h"
+#include "internal/routines/level2/xtrmv.h"
+#include "internal/routines/level2/xtbmv.h"
+#include "internal/routines/level2/xtpmv.h"
// BLAS level-3 includes
#include "internal/routines/level3/xgemm.h"
@@ -628,12 +631,20 @@ template StatusCode Spmv<double>(const Layout, const Triangle,
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
template <typename T>
-StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
- const size_t,
- const cl_mem, const size_t, const size_t,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xtrmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoTrmv(layout, triangle, a_transpose, diagonal,
+ n,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
@@ -658,12 +669,20 @@ template StatusCode Trmv<double2>(const Layout, const Triangle, const Transpose,
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
template <typename T>
-StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal,
- const size_t, const size_t,
- const cl_mem, const size_t, const size_t,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xtbmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoTbmv(layout, triangle, a_transpose, diagonal,
+ n, k,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
@@ -688,12 +707,20 @@ template StatusCode Tbmv<double2>(const Layout, const Triangle, const Transpose,
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
template <typename T>
-StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal,
- const size_t,
- const cl_mem, const size_t,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xtpmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoTpmv(layout, triangle, a_transpose, diagonal,
+ n,
+ Buffer<T>(ap_buffer), ap_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index ab7802e5..8ed0e9e4 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -107,6 +107,19 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
#endif
}
+ // For triangular matrices
+ #elif defined(ROUTINE_TRMV)
+ if (((parameter == 0 || parameter == 2) && y <= x) ||
+ ((parameter == 1 || parameter == 3) && x <= y)) {
+ result = agm[a_ld*y + x + a_offset];
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+
// For symmetric/hermitian banded matrices
#elif defined(ROUTINE_HBMV) || defined(ROUTINE_SBMV)
if (parameter == 1) {
@@ -146,6 +159,35 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
}
}
+ // For triangular banded matrices
+ #elif defined(ROUTINE_TBMV)
+ if (parameter == 1 || parameter == 3) {
+ if (x <= y) {
+ const int m = kl - y;
+ if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; }
+ else { SetToZero(result); }
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+ else {
+ if (x >= y) {
+ const int m = -y;
+ if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; }
+ else { SetToZero(result); }
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+
// For symmetric/hermitian packed matrices
#elif defined(ROUTINE_HPMV) || defined(ROUTINE_SPMV)
if (parameter == 1) {
@@ -177,6 +219,31 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
}
}
+ // For triangular packed matrices
+ #elif defined(ROUTINE_TPMV)
+ if (parameter == 1 || parameter == 3) {
+ if (x <= y) {
+ result = agm[((y+1)*y)/2 + x + a_offset];
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+ else {
+ if (x >= y) {
+ result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset];
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+
// For general matrices
#else
result = agm[a_ld*y + x + a_offset];
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc
new file mode 100644
index 00000000..2e1aebff
--- /dev/null
+++ b/src/routines/level2/xtbmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtbmv<T>::Xtbmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+ // Creates a copy of X: a temporary scratch buffer
+ auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+ try {
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+ } catch (...) { } // Continues: error-code is returned in MatVec
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Adds '2' to the parameter if the diagonal is unit
+ auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_TBMV define.
+ auto fast_kernels = false;
+ auto status = MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ a_buffer, a_offset, a_ld,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, false, k, 0);
+
+ // Returns the proper error code (renames vector Y to X)
+ switch(status) {
+ case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
+ case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
+ case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+ default: return status;
+ }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtbmv<float>;
+template class Xtbmv<double>;
+template class Xtbmv<float2>;
+template class Xtbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc
new file mode 100644
index 00000000..aa0e099b
--- /dev/null
+++ b/src/routines/level2/xtpmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtpmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtpmv<T>::Xtpmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+ // Creates a copy of X: a temporary scratch buffer
+ auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+ try {
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+ } catch (...) { } // Continues: error-code is returned in MatVec
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Adds '2' to the parameter if the diagonal is unit
+ auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_TPMV define.
+ auto fast_kernels = false;
+ auto status = MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ ap_buffer, ap_offset, n,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, true, 0, 0);
+
+ // Returns the proper error code (renames vector Y to X)
+ switch(status) {
+ case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
+ case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
+ case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+ default: return status;
+ }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtpmv<float>;
+template class Xtpmv<double>;
+template class Xtpmv<float2>;
+template class Xtpmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc
new file mode 100644
index 00000000..94424743
--- /dev/null
+++ b/src/routines/level2/xtrmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtrmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtrmv<T>::Xtrmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+ // Creates a copy of X: a temporary scratch buffer
+ auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+ try {
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+ } catch (...) { } // Continues: error-code is returned in MatVec
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Adds '2' to the parameter if the diagonal is unit
+ auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific triangular matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_TRMV define.
+ auto fast_kernels = false;
+ auto status = MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ a_buffer, a_offset, a_ld,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, false, 0, 0);
+
+ // Returns the proper error code (renames vector Y to X)
+ switch(status) {
+ case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
+ case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
+ case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+ default: return status;
+ }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtrmv<float>;
+template class Xtrmv<double>;
+template class Xtrmv<float2>;
+template class Xtrmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h
new file mode 100644
index 00000000..dbdddb65
--- /dev/null
+++ b/test/routines/level2/xtbmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtbmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTBMV_H_
+#define CLBLAST_TEST_ROUTINES_XTBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtbmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN, kArgKL,
+ kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+ kArgALeadDim, kArgXInc,
+ kArgAOffset, kArgXOffset};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeA(const Arguments<T> &args) {
+ return args.n * args.a_ld + args.a_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.a_size = GetSizeA(args);
+ args.x_size = GetSizeX(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &all) { return all; }
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n, args.kl,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n, args.kl,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.x_size, static_cast<T>(0));
+ buffers.x_vec.Read(queue, args.x_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ return args.n;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.x_inc + args.x_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTBMV_H_
+#endif
diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h
new file mode 100644
index 00000000..4425765e
--- /dev/null
+++ b/test/routines/level2/xtpmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtpmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTPMV_H_
+#define CLBLAST_TEST_ROUTINES_XTPMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtpmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN,
+ kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+ kArgXInc,
+ kArgAPOffset, kArgXOffset};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeAP(const Arguments<T> &args) {
+ return ((args.n*(args.n+1)) / 2) + args.ap_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.ap_size = GetSizeAP(args);
+ args.x_size = GetSizeX(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &all) { return all; }
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.x_size, static_cast<T>(0));
+ buffers.x_vec.Read(queue, args.x_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ return args.n;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.x_inc + args.x_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTPMV_H_
+#endif
diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h
new file mode 100644
index 00000000..1c0c6fd8
--- /dev/null
+++ b/test/routines/level2/xtrmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtrmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTRMV_H_
+#define CLBLAST_TEST_ROUTINES_XTRMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtrmv {
+ public:
+
+ // The BLAS level: 1, 2, or 3
+ static size_t BLASLevel() { return 2; }
+
+ // The list of arguments relevant for this routine
+ static std::vector<std::string> GetOptions() {
+ return {kArgN,
+ kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+ kArgALeadDim, kArgXInc,
+ kArgAOffset, kArgXOffset};
+ }
+
+ // Describes how to obtain the sizes of the buffers
+ static size_t GetSizeX(const Arguments<T> &args) {
+ return args.n * args.x_inc + args.x_offset;
+ }
+ static size_t GetSizeA(const Arguments<T> &args) {
+ return args.n * args.a_ld + args.a_offset;
+ }
+
+ // Describes how to set the sizes of all the buffers
+ static void SetSizes(Arguments<T> &args) {
+ args.a_size = GetSizeA(args);
+ args.x_size = GetSizeX(args);
+ }
+
+ // Describes what the default values of the leading dimensions of the matrices are
+ static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+ static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+ // Describes which transpose options are relevant for this routine
+ using Transposes = std::vector<Transpose>;
+ static Transposes GetATransposes(const Transposes &all) { return all; }
+ static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+ // Describes how to run the CLBlast routine
+ static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ clWaitForEvents(1, &event);
+ return status;
+ }
+
+ // Describes how to run the clBLAS routine (for correctness/performance comparison)
+ static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
+ static_cast<clblasUplo>(args.triangle),
+ static_cast<clblasTranspose>(args.a_transpose),
+ static_cast<clblasDiag>(args.diagonal),
+ args.n,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ 1, &queue_plain, 0, nullptr, &event);
+ clWaitForEvents(1, &event);
+ return static_cast<StatusCode>(status);
+ }
+
+ // Describes how to download the results of the computation (more importantly: which buffer)
+ static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ std::vector<T> result(args.x_size, static_cast<T>(0));
+ buffers.x_vec.Read(queue, args.x_size, result);
+ return result;
+ }
+
+ // Describes how to compute the indices of the result buffer
+ static size_t ResultID1(const Arguments<T> &args) {
+ return args.n;
+ }
+ static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+ static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+ return id1*args.x_inc + args.x_offset;
+ }
+
+ // Describes how to compute performance metrics
+ static size_t GetFlops(const Arguments<T> &args) {
+ return 2 * args.n * args.n;
+ }
+ static size_t GetBytes(const Arguments<T> &args) {
+ return (args.n*args.n + 2*args.n + args.n) * sizeof(T);
+ }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTRMV_H_
+#endif
diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h
index 10c7dd47..23a02a45 100644
--- a/test/wrapper_clblas.h
+++ b/test/wrapper_clblas.h
@@ -238,7 +238,7 @@ clblasStatus clblasXdot<float>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float>(context, n);
+ auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
return clblasSdot(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -255,7 +255,7 @@ clblasStatus clblasXdot<double>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double>(context, n);
+ auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
return clblasDdot(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -281,7 +281,7 @@ clblasStatus clblasXdotu<float2>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float2>(context, n);
+ auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
return clblasCdotu(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -298,7 +298,7 @@ clblasStatus clblasXdotu<double2>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double2>(context, n);
+ auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
return clblasZdotu(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -324,7 +324,7 @@ clblasStatus clblasXdotc<float2>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float2>(context, n);
+ auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
return clblasCdotc(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -341,7 +341,7 @@ clblasStatus clblasXdotc<double2>(const size_t n,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double2>(context, n);
+ auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
return clblasZdotc(n,
dot_buffer, dot_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
@@ -747,7 +747,7 @@ clblasStatus clblasXtrmv<float>(const clblasOrder layout, const clblasUplo trian
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float>(context, n);
+ auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
return clblasStrmv(layout, triangle, a_transpose, diagonal,
n,
a_buffer, a_offset, a_ld,
@@ -764,7 +764,7 @@ clblasStatus clblasXtrmv<double>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double>(context, n);
+ auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
return clblasDtrmv(layout, triangle, a_transpose, diagonal,
n,
a_buffer, a_offset, a_ld,
@@ -781,7 +781,7 @@ clblasStatus clblasXtrmv<float2>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float2>(context, n);
+ auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
return clblasCtrmv(layout, triangle, a_transpose, diagonal,
n,
a_buffer, a_offset, a_ld,
@@ -798,7 +798,7 @@ clblasStatus clblasXtrmv<double2>(const clblasOrder layout, const clblasUplo tri
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double2>(context, n);
+ auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
return clblasZtrmv(layout, triangle, a_transpose, diagonal,
n,
a_buffer, a_offset, a_ld,
@@ -824,7 +824,7 @@ clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo trian
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float>(context, n);
+ auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
return clblasStbmv(layout, triangle, a_transpose, diagonal,
n, k,
a_buffer, a_offset, a_ld,
@@ -841,7 +841,7 @@ clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double>(context, n);
+ auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
return clblasDtbmv(layout, triangle, a_transpose, diagonal,
n, k,
a_buffer, a_offset, a_ld,
@@ -858,7 +858,7 @@ clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float2>(context, n);
+ auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
return clblasCtbmv(layout, triangle, a_transpose, diagonal,
n, k,
a_buffer, a_offset, a_ld,
@@ -875,7 +875,7 @@ clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo tri
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double2>(context, n);
+ auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
return clblasZtbmv(layout, triangle, a_transpose, diagonal,
n, k,
a_buffer, a_offset, a_ld,
@@ -901,7 +901,7 @@ clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo trian
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float>(context, n);
+ auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
return clblasStpmv(layout, triangle, a_transpose, diagonal,
n,
ap_buffer, ap_offset,
@@ -918,7 +918,7 @@ clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double>(context, n);
+ auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
return clblasDtpmv(layout, triangle, a_transpose, diagonal,
n,
ap_buffer, ap_offset,
@@ -935,7 +935,7 @@ clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo tria
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<float2>(context, n);
+ auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
return clblasCtpmv(layout, triangle, a_transpose, diagonal,
n,
ap_buffer, ap_offset,
@@ -952,7 +952,7 @@ clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo tri
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
- auto scratch_buffer = Buffer<double2>(context, n);
+ auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
return clblasZtpmv(layout, triangle, a_transpose, diagonal,
n,
ap_buffer, ap_offset,