summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2015-09-26 17:02:34 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2015-09-26 17:02:34 +0200
commit92b4b0d1feaaf92e160fa0342daf4269f24fb4d2 (patch)
tree3356b67a281d8292e893028d74a1801554ce0ef2 /src
parent42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff)
parent2b56c2c60325f02bc695cbb968049cc09205c713 (diff)
Merge pull request #27 from CNugteren/level2_matrix_vector
Added many level-2 matrix-vector routines
Diffstat (limited to 'src')
-rw-r--r--src/clblast.cc213
-rw-r--r--src/kernels/level2/xgemv.opencl (renamed from src/kernels/xgemv.opencl)208
-rw-r--r--src/routine.cc12
-rw-r--r--src/routines/level2/xgbmv.cc67
-rw-r--r--src/routines/level2/xgemv.cc60
-rw-r--r--src/routines/level2/xhbmv.cc64
-rw-r--r--src/routines/level2/xhemv.cc66
-rw-r--r--src/routines/level2/xhpmv.cc64
-rw-r--r--src/routines/level2/xsbmv.cc64
-rw-r--r--src/routines/level2/xspmv.cc64
-rw-r--r--src/routines/level2/xsymv.cc66
-rw-r--r--src/routines/level2/xtbmv.cc81
-rw-r--r--src/routines/level2/xtpmv.cc81
-rw-r--r--src/routines/level2/xtrmv.cc81
-rw-r--r--src/tuning/xgemv.cc2
15 files changed, 995 insertions, 198 deletions
diff --git a/src/clblast.cc b/src/clblast.cc
index a0dd8c70..77999aaf 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -28,8 +28,16 @@
// BLAS level-2 includes
#include "internal/routines/level2/xgemv.h"
+#include "internal/routines/level2/xgbmv.h"
#include "internal/routines/level2/xhemv.h"
+#include "internal/routines/level2/xhbmv.h"
+#include "internal/routines/level2/xhpmv.h"
#include "internal/routines/level2/xsymv.h"
+#include "internal/routines/level2/xsbmv.h"
+#include "internal/routines/level2/xspmv.h"
+#include "internal/routines/level2/xtrmv.h"
+#include "internal/routines/level2/xtbmv.h"
+#include "internal/routines/level2/xtpmv.h"
// BLAS level-3 includes
#include "internal/routines/level3/xgemm.h"
@@ -327,15 +335,26 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
template <typename T>
-StatusCode Gbmv(const Layout, const Transpose,
- const size_t, const size_t, const size_t, const size_t,
- const T,
- const cl_mem, const size_t, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xgbmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoGbmv(layout, a_transpose,
+ m, n, kl, ku,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Gbmv<float>(const Layout, const Transpose,
const size_t, const size_t, const size_t, const size_t,
@@ -412,15 +431,26 @@ template StatusCode Hemv<double2>(const Layout, const Triangle,
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
template <typename T>
-StatusCode Hbmv(const Layout, const Triangle,
- const size_t, const size_t,
- const T,
- const cl_mem, const size_t, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Hbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xhbmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoHbmv(layout, triangle,
+ n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Hbmv<float2>(const Layout, const Triangle,
const size_t, const size_t,
@@ -441,15 +471,26 @@ template StatusCode Hbmv<double2>(const Layout, const Triangle,
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
template <typename T>
-StatusCode Hpmv(const Layout, const Triangle,
- const size_t,
- const T,
- const cl_mem, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Hpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xhpmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoHpmv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(ap_buffer), ap_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Hpmv<float2>(const Layout, const Triangle,
const size_t,
@@ -510,15 +551,26 @@ template StatusCode Symv<double>(const Layout, const Triangle,
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
template <typename T>
-StatusCode Sbmv(const Layout, const Triangle,
- const size_t, const size_t,
- const T,
- const cl_mem, const size_t, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Sbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xsbmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoSbmv(layout, triangle,
+ n, k,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Sbmv<float>(const Layout, const Triangle,
const size_t, const size_t,
@@ -539,15 +591,26 @@ template StatusCode Sbmv<double>(const Layout, const Triangle,
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
template <typename T>
-StatusCode Spmv(const Layout, const Triangle,
- const size_t,
- const T,
- const cl_mem, const size_t,
- const cl_mem, const size_t, const size_t,
- const T,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Spmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xspmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoSpmv(layout, triangle,
+ n,
+ alpha,
+ Buffer<T>(ap_buffer), ap_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Spmv<float>(const Layout, const Triangle,
const size_t,
@@ -568,12 +631,20 @@ template StatusCode Spmv<double>(const Layout, const Triangle,
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
template <typename T>
-StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
- const size_t,
- const cl_mem, const size_t, const size_t,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xtrmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoTrmv(layout, triangle, a_transpose, diagonal,
+ n,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
@@ -598,12 +669,20 @@ template StatusCode Trmv<double2>(const Layout, const Triangle, const Transpose,
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
template <typename T>
-StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal,
- const size_t, const size_t,
- const cl_mem, const size_t, const size_t,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xtbmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoTbmv(layout, triangle, a_transpose, diagonal,
+ n, k,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
@@ -628,12 +707,20 @@ template StatusCode Tbmv<double2>(const Layout, const Triangle, const Transpose,
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
template <typename T>
-StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal,
- const size_t,
- const cl_mem, const size_t,
- cl_mem, const size_t, const size_t,
- cl_command_queue*, cl_event*) {
- return StatusCode::kNotImplemented;
+StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const cl_mem ap_buffer, const size_t ap_offset,
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto event_cpp = Event(*event);
+ auto routine = Xtpmv<T>(queue_cpp, event_cpp);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoTpmv(layout, triangle, a_transpose, diagonal,
+ n,
+ Buffer<T>(ap_buffer), ap_offset,
+ Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
diff --git a/src/kernels/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 1e12dd78..8ed0e9e4 100644
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -79,22 +79,189 @@ R"(
#endif
// =================================================================================================
-// Defines how to load the input matrix in the regular case
-// Loads a scalar input value
+// Defines how to load the input matrix in the non-vectorized case
inline real LoadMatrixA(const __global real* restrict agm, const int x, const int y,
- const int a_ld, const int a_offset) {
- return agm[x + a_ld*y + a_offset];
+ const int a_ld, const int a_offset, const int parameter,
+ const int kl, const int ku) {
+ real result;
+
+ // For banded matrices
+ #if defined(ROUTINE_GBMV)
+ const int k = ku - y;
+ if (x >= y-ku && x < y+kl+1) { result = agm[a_ld*y + k + x + a_offset]; }
+ else { SetToZero(result); }
+
+ // For symmetric/hermitian matrices
+ #elif defined(ROUTINE_HEMV) || defined(ROUTINE_SYMV)
+ if ((parameter == 0 && y <= x) || (parameter == 1 && x <= y)) {
+ result = agm[a_ld*y + x + a_offset];
+ #if defined(ROUTINE_HEMV)
+ if (x == y) { result.y = ZERO; }
+ #endif
+ }
+ else {
+ result = agm[a_ld*x + y + a_offset];
+ #if defined(ROUTINE_HEMV)
+ COMPLEX_CONJUGATE(result);
+ #endif
+ }
+
+ // For triangular matrices
+ #elif defined(ROUTINE_TRMV)
+ if (((parameter == 0 || parameter == 2) && y <= x) ||
+ ((parameter == 1 || parameter == 3) && x <= y)) {
+ result = agm[a_ld*y + x + a_offset];
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+
+ // For symmetric/hermitian banded matrices
+ #elif defined(ROUTINE_HBMV) || defined(ROUTINE_SBMV)
+ if (parameter == 1) {
+ if (x <= y) {
+ const int m = kl - y;
+ if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; }
+ else { SetToZero(result); }
+ #if defined(ROUTINE_HBMV)
+ if (x == y) { result.y = ZERO; }
+ #endif
+ }
+ else {
+ const int m = kl - x;
+ if (y >= x-kl && y <= x) { result = agm[a_ld*x + m + y + a_offset]; }
+ else { SetToZero(result); }
+ #if defined(ROUTINE_HBMV)
+ COMPLEX_CONJUGATE(result);
+ #endif
+ }
+ }
+ else {
+ if (x >= y) {
+ const int m = -y;
+ if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; }
+ else { SetToZero(result); }
+ #if defined(ROUTINE_HBMV)
+ if (x == y) { result.y = ZERO; }
+ #endif
+ }
+ else {
+ const int m = -x;
+ if (y >= x && y < x+kl+1) { result = agm[a_ld*x + m + y + a_offset]; }
+ else { SetToZero(result); }
+ #if defined(ROUTINE_HBMV)
+ COMPLEX_CONJUGATE(result);
+ #endif
+ }
+ }
+
+ // For triangular banded matrices
+ #elif defined(ROUTINE_TBMV)
+ if (parameter == 1 || parameter == 3) {
+ if (x <= y) {
+ const int m = kl - y;
+ if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; }
+ else { SetToZero(result); }
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+ else {
+ if (x >= y) {
+ const int m = -y;
+ if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; }
+ else { SetToZero(result); }
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+
+ // For symmetric/hermitian packed matrices
+ #elif defined(ROUTINE_HPMV) || defined(ROUTINE_SPMV)
+ if (parameter == 1) {
+ if (x <= y) {
+ result = agm[((y+1)*y)/2 + x + a_offset];
+ #if defined(ROUTINE_HPMV)
+ if (x == y) { result.y = ZERO; }
+ #endif
+ }
+ else {
+ result = agm[((x+1)*x)/2 + y + a_offset];
+ #if defined(ROUTINE_HPMV)
+ COMPLEX_CONJUGATE(result);
+ #endif
+ }
+ }
+ else {
+ if (x >= y) {
+ result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset];
+ #if defined(ROUTINE_HPMV)
+ if (x == y) { result.y = ZERO; }
+ #endif
+ }
+ else {
+ result = agm[((2*a_ld-(x+1))*x)/2 + y + a_offset];
+ #if defined(ROUTINE_HPMV)
+ COMPLEX_CONJUGATE(result);
+ #endif
+ }
+ }
+
+ // For triangular packed matrices
+ #elif defined(ROUTINE_TPMV)
+ if (parameter == 1 || parameter == 3) {
+ if (x <= y) {
+ result = agm[((y+1)*y)/2 + x + a_offset];
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+ else {
+ if (x >= y) {
+ result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset];
+ if (parameter >= 2 && y == x) {
+ SetToOne(result);
+ }
+ }
+ else {
+ SetToZero(result);
+ }
+ }
+
+ // For general matrices
+ #else
+ result = agm[a_ld*y + x + a_offset];
+ #endif
+
+ return result;
}
+
// Loads a vector input value (1/2)
inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
const int a_ld) {
- return agm[x + a_ld*y];
+ return agm[a_ld*y + x];
}
+
// Loads a vector input value (2/2): as before, but different data-type
inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
const int a_ld) {
- return agm[x + a_ld*y];
+ return agm[a_ld*y + x];
}
// =================================================================================================
@@ -106,7 +273,8 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
const __global real* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
- const int do_conjugate) {
+ const int do_conjugate, const int parameter,
+ const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS1];
@@ -141,20 +309,20 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
// The multiply-add function for the main part (divisable by WGS1)
if (a_rotated == 0) { // Not rotated
#pragma unroll
- for (int kl=0; kl<WGS1; ++kl) {
- const int k = kwg + kl;
- real value = LoadMatrixA(agm, gid, k, a_ld, a_offset);
+ for (int kloop=0; kloop<WGS1; ++kloop) {
+ const int k = kwg + kloop;
+ real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
- MultiplyAdd(acc[w], xlm[kl], value);
+ MultiplyAdd(acc[w], xlm[kloop], value);
}
}
else { // Transposed
#pragma unroll
- for (int kl=0; kl<WGS1; ++kl) {
- const int k = kwg + kl;
- real value = LoadMatrixA(agm, k, gid, a_ld, a_offset);
+ for (int kloop=0; kloop<WGS1; ++kloop) {
+ const int k = kwg + kloop;
+ real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
- MultiplyAdd(acc[w], xlm[kl], value);
+ MultiplyAdd(acc[w], xlm[kloop], value);
}
}
}
@@ -174,7 +342,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
if (a_rotated == 0) { // Not rotated
#pragma unroll
for (int k=n_floor; k<n; ++k) {
- real value = LoadMatrixA(agm, gid, k, a_ld, a_offset);
+ real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value);
}
@@ -182,7 +350,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
else { // Transposed
#pragma unroll
for (int k=n_floor; k<n; ++k) {
- real value = LoadMatrixA(agm, k, gid, a_ld, a_offset);
+ real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value);
}
@@ -209,7 +377,8 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
const __global realVF* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
- const int do_conjugate) {
+ const int do_conjugate, const int parameter,
+ const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS2];
@@ -305,7 +474,8 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
- const int do_conjugate) {
+ const int do_conjugate, const int parameter,
+ const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS3];
diff --git a/src/routine.cc b/src/routine.cc
index 05a03683..2978c94a 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -191,6 +191,18 @@ StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buf
return StatusCode::kSuccess;
}
+// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size
+template <typename T>
+StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer,
+ const size_t offset, const size_t data_size) {
+ try {
+ auto required_size = (((n*(n+1))/2) + offset)*data_size;
+ auto buffer_size = buffer.GetSize();
+ if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
+ } catch (...) { return StatusCode::kInvalidMatrixA; }
+ return StatusCode::kSuccess;
+}
+
// =================================================================================================
// Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc
new file mode 100644
index 00000000..14d391ca
--- /dev/null
+++ b/src/routines/level2/xgbmv.cc
@@ -0,0 +1,67 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xgbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const size_t kl, const size_t ku,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // Reverses the upper and lower band count
+ auto rotated = (layout == Layout::kRowMajor);
+ auto kl_real = (rotated) ? ku : kl;
+ auto ku_real = (rotated) ? kl : ku;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_GBMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, a_transpose,
+ m, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ 0, false, kl_real, ku_real);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgbmv<float>;
+template class Xgbmv<double>;
+template class Xgbmv<float2>;
+template class Xgbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index f95a9957..1b768dcd 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -32,8 +32,7 @@ template <typename T>
Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
source_string_ =
- #include "../../kernels/pad.opencl" // For {Herm,Symm}{Upper,Lower}ToSquared (for HEMV/SYMV)
- #include "../../kernels/xgemv.opencl"
+ #include "../../kernels/level2/xgemv.opencl"
;
}
@@ -49,6 +48,31 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+ // Performs the matrix-vector multiplication
+ return MatVec(layout, a_transpose,
+ m, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ true, true,
+ 0, false, 0, 0); // N/A for this routine
+}
+
+// =================================================================================================
+
+// The generic implementation, also suited for other (non general) matrix-vector multiplications
+template <typename T>
+StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+ bool fast_kernel, bool fast_kernel_rot,
+ const size_t parameter, const bool packed,
+ const size_t kl, const size_t ku) {
+
// Makes sure all dimensions are larger than zero
if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
@@ -62,6 +86,11 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
auto m_real = (a_transposed) ? n : m;
auto n_real = (a_transposed) ? m : n;
+ // Special adjustments for banded matrices
+ if (kl != 0 || ku != 0) {
+ a_one = kl+ku+1;
+ }
+
// Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
auto a_rotated = a_transposed ^ a_altlayout;
@@ -69,7 +98,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
auto a_conjugate = (a_transpose == Transpose::kConjugate);
// Tests the matrix and the vectors for validity
- auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+ auto status = StatusCode::kSuccess;
+ if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
+ else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
@@ -77,26 +108,26 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
if (ErrorIn(status)) { return status; }
// Determines whether or not the fast-version can be used
- bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
- IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
- IsMultiple(n, db_["WGS2"]) &&
- IsMultiple(a_ld, db_["VW2"]);
- bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
- IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
- IsMultiple(n, db_["WGS3"]) &&
- IsMultiple(a_ld, db_["VW3"]);
+ fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
+ IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
+ IsMultiple(n, db_["WGS2"]) &&
+ IsMultiple(a_ld, db_["VW2"]);
+ fast_kernel_rot = fast_kernel_rot && (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
+ IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
+ IsMultiple(n, db_["WGS3"]) &&
+ IsMultiple(a_ld, db_["VW3"]);
// If possible, run the fast-version (rotated or non-rotated) of the kernel
auto kernel_name = "Xgemv";
auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
auto global_size = m_ceiled / db_["WPT1"];
auto local_size = db_["WGS1"];
- if (use_fast_kernel) {
+ if (fast_kernel) {
kernel_name = "XgemvFast";
global_size = m_real / db_["WPT2"];
local_size = db_["WGS2"];
}
- if (use_fast_kernel_rot) {
+ if (fast_kernel_rot) {
kernel_name = "XgemvFastRot";
global_size = m_real / db_["WPT3"];
local_size = db_["WGS3"];
@@ -123,6 +154,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
kernel.SetArgument(12, static_cast<int>(y_offset));
kernel.SetArgument(13, static_cast<int>(y_inc));
kernel.SetArgument(14, static_cast<int>(a_conjugate));
+ kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
+ kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
+ kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
// Launches the kernel
auto global = std::vector<size_t>{global_size};
diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc
new file mode 100644
index 00000000..f59a7cb3
--- /dev/null
+++ b/src/routines/level2/xhbmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xhbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_HBMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, k, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhbmv<float2>;
+template class Xhbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc
index 2d92e45f..5a58b28b 100644
--- a/src/routines/level2/xhemv.cc
+++ b/src/routines/level2/xhemv.cc
@@ -37,57 +37,21 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
- // Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
-
- // Checks for validity of the squared A matrix
- auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T));
- if (ErrorIn(status)) { return status; }
-
- // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as
- // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
- bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
- (triangle == Triangle::kLower && layout == Layout::kRowMajor));
- auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
-
- // Temporary buffer for a copy of the hermitian matrix
- try {
- auto temp_herm = Buffer<T>(context_, n*n);
-
- // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemv
- // routine afterwards
- try {
- auto& program = GetProgramFromCache();
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the arguments for the hermitian-to-squared kernel
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, static_cast<int>(a_ld));
- kernel.SetArgument(2, static_cast<int>(a_offset));
- kernel.SetArgument(3, a_buffer());
- kernel.SetArgument(4, static_cast<int>(n));
- kernel.SetArgument(5, static_cast<int>(n));
- kernel.SetArgument(6, static_cast<int>(0));
- kernel.SetArgument(7, temp_herm());
-
- // Uses the common padding kernel's thread configuration. This is allowed, since the
- // hermitian-to-squared kernel uses the same parameters.
- auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
- Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
- auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- status = RunKernel(kernel, global, local);
- if (ErrorIn(status)) { return status; }
-
- // Runs the regular Xgemv code
- status = DoGemv(layout, Transpose::kNo, n, n, alpha,
- temp_herm, 0, n,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc);
-
- // Return the status of the Xgemv routine
- return status;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_HEMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc
new file mode 100644
index 00000000..2269255d
--- /dev/null
+++ b/src/routines/level2/xhpmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xhpmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhpmv<T>::Xhpmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_HPMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ ap_buffer, ap_offset, n,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, true, 0, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhpmv<float2>;
+template class Xhpmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc
new file mode 100644
index 00000000..457bd762
--- /dev/null
+++ b/src/routines/level2/xsbmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xsbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xsbmv<T>::Xsbmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
+ const size_t n, const size_t k,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_SBMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, k, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xsbmv<float>;
+template class Xsbmv<double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc
new file mode 100644
index 00000000..4f1a9c61
--- /dev/null
+++ b/src/routines/level2/xspmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xspmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xspmv<T>::Xspmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const T alpha,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const T beta,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_SPMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ ap_buffer, ap_offset, n,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, true, 0, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xspmv<float>;
+template class Xspmv<double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc
index 2ccb51f6..ec12324b 100644
--- a/src/routines/level2/xsymv.cc
+++ b/src/routines/level2/xsymv.cc
@@ -37,57 +37,21 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
- // Makes sure all dimensions are larger than zero
- if (n == 0) { return StatusCode::kInvalidDimension; }
-
- // Checks for validity of the squared A matrix
- auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T));
- if (ErrorIn(status)) { return status; }
-
- // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as
- // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
- bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
- (triangle == Triangle::kLower && layout == Layout::kRowMajor));
- auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
-
- // Temporary buffer for a copy of the symmetric matrix
- try {
- auto temp_symm = Buffer<T>(context_, n*n);
-
- // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemv
- // routine afterwards
- try {
- auto& program = GetProgramFromCache();
- auto kernel = Kernel(program, kernel_name);
-
- // Sets the arguments for the symmetric-to-squared kernel
- kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, static_cast<int>(a_ld));
- kernel.SetArgument(2, static_cast<int>(a_offset));
- kernel.SetArgument(3, a_buffer());
- kernel.SetArgument(4, static_cast<int>(n));
- kernel.SetArgument(5, static_cast<int>(n));
- kernel.SetArgument(6, static_cast<int>(0));
- kernel.SetArgument(7, temp_symm());
-
- // Uses the common padding kernel's thread configuration. This is allowed, since the
- // symmetric-to-squared kernel uses the same parameters.
- auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
- Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
- auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- status = RunKernel(kernel, global, local);
- if (ErrorIn(status)) { return status; }
-
- // Runs the regular Xgemv code
- status = DoGemv(layout, Transpose::kNo, n, n, alpha,
- temp_symm, 0, n,
- x_buffer, x_offset, x_inc, beta,
- y_buffer, y_offset, y_inc);
-
- // Return the status of the Xgemv routine
- return status;
- } catch (...) { return StatusCode::kInvalidKernel; }
- } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific symmetric matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_SYMV define.
+ bool fast_kernels = false;
+ return MatVec(layout, Transpose::kNo,
+ n, n, alpha,
+ a_buffer, a_offset, a_ld,
+ x_buffer, x_offset, x_inc, beta,
+ y_buffer, y_offset, y_inc,
+ fast_kernels, fast_kernels,
+ is_upper, false, 0, 0);
}
// =================================================================================================
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc
new file mode 100644
index 00000000..2e1aebff
--- /dev/null
+++ b/src/routines/level2/xtbmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtbmv<T>::Xtbmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n, const size_t k,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+ // Creates a copy of X: a temporary scratch buffer
+ auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+ try {
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+ } catch (...) { } // Continues: error-code is returned in MatVec
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Adds '2' to the parameter if the diagonal is unit
+ auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_TBMV define.
+ auto fast_kernels = false;
+ auto status = MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ a_buffer, a_offset, a_ld,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, false, k, 0);
+
+ // Returns the proper error code (renames vector Y to X)
+ switch(status) {
+ case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
+ case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
+ case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+ default: return status;
+ }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtbmv<float>;
+template class Xtbmv<double>;
+template class Xtbmv<float2>;
+template class Xtbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc
new file mode 100644
index 00000000..aa0e099b
--- /dev/null
+++ b/src/routines/level2/xtpmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtpmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtpmv<T>::Xtpmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &ap_buffer, const size_t ap_offset,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+ // Creates a copy of X: a temporary scratch buffer
+ auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+ try {
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+ } catch (...) { } // Continues: error-code is returned in MatVec
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Adds '2' to the parameter if the diagonal is unit
+ auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_TPMV define.
+ auto fast_kernels = false;
+ auto status = MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ ap_buffer, ap_offset, n,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, true, 0, 0);
+
+ // Returns the proper error code (renames vector Y to X)
+ switch(status) {
+ case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
+ case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
+ case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+ default: return status;
+ }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtpmv<float>;
+template class Xtpmv<double>;
+template class Xtpmv<float2>;
+template class Xtpmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc
new file mode 100644
index 00000000..94424743
--- /dev/null
+++ b/src/routines/level2/xtrmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtrmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtrmv<T>::Xtrmv(Queue &queue, Event &event, const std::string &name):
+ Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+ // Creates a copy of X: a temporary scratch buffer
+ auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+ try {
+ x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+ } catch (...) { } // Continues: error-code is returned in MatVec
+
+ // The data is either in the upper or lower triangle
+ size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Adds '2' to the parameter if the diagonal is unit
+ auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+ // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+ // The specific triangular matrix-accesses are implemented in the kernel guarded by the
+ // ROUTINE_TRMV define.
+ auto fast_kernels = false;
+ auto status = MatVec(layout, a_transpose,
+ n, n, static_cast<T>(1),
+ a_buffer, a_offset, a_ld,
+ scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+ x_buffer, x_offset, x_inc,
+ fast_kernels, fast_kernels,
+ parameter, false, 0, 0);
+
+ // Returns the proper error code (renames vector Y to X)
+ switch(status) {
+ case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
+ case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
+ case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+ default: return status;
+ }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtrmv<float>;
+template class Xtrmv<double>;
+template class Xtrmv<float2>;
+template class Xtrmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 3d6fe595..6a066518 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -34,7 +34,7 @@ class TuneXgemv {
static std::string GetSources() {
return
#include "../src/kernels/common.opencl"
- #include "../src/kernels/xgemv.opencl"
+ #include "../src/kernels/level2/xgemv.opencl"
;
}