diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2015-09-26 17:02:34 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2015-09-26 17:02:34 +0200 |
commit | 92b4b0d1feaaf92e160fa0342daf4269f24fb4d2 (patch) | |
tree | 3356b67a281d8292e893028d74a1801554ce0ef2 /src | |
parent | 42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff) | |
parent | 2b56c2c60325f02bc695cbb968049cc09205c713 (diff) |
Merge pull request #27 from CNugteren/level2_matrix_vector
Added many level-2 matrix-vector routines
Diffstat (limited to 'src')
-rw-r--r-- | src/clblast.cc | 213 | ||||
-rw-r--r-- | src/kernels/level2/xgemv.opencl (renamed from src/kernels/xgemv.opencl) | 208 | ||||
-rw-r--r-- | src/routine.cc | 12 | ||||
-rw-r--r-- | src/routines/level2/xgbmv.cc | 67 | ||||
-rw-r--r-- | src/routines/level2/xgemv.cc | 60 | ||||
-rw-r--r-- | src/routines/level2/xhbmv.cc | 64 | ||||
-rw-r--r-- | src/routines/level2/xhemv.cc | 66 | ||||
-rw-r--r-- | src/routines/level2/xhpmv.cc | 64 | ||||
-rw-r--r-- | src/routines/level2/xsbmv.cc | 64 | ||||
-rw-r--r-- | src/routines/level2/xspmv.cc | 64 | ||||
-rw-r--r-- | src/routines/level2/xsymv.cc | 66 | ||||
-rw-r--r-- | src/routines/level2/xtbmv.cc | 81 | ||||
-rw-r--r-- | src/routines/level2/xtpmv.cc | 81 | ||||
-rw-r--r-- | src/routines/level2/xtrmv.cc | 81 | ||||
-rw-r--r-- | src/tuning/xgemv.cc | 2 |
15 files changed, 995 insertions, 198 deletions
diff --git a/src/clblast.cc b/src/clblast.cc index a0dd8c70..77999aaf 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -28,8 +28,16 @@ // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" +#include "internal/routines/level2/xgbmv.h" #include "internal/routines/level2/xhemv.h" +#include "internal/routines/level2/xhbmv.h" +#include "internal/routines/level2/xhpmv.h" #include "internal/routines/level2/xsymv.h" +#include "internal/routines/level2/xsbmv.h" +#include "internal/routines/level2/xspmv.h" +#include "internal/routines/level2/xtrmv.h" +#include "internal/routines/level2/xtbmv.h" +#include "internal/routines/level2/xtpmv.h" // BLAS level-3 includes #include "internal/routines/level3/xgemm.h" @@ -327,15 +335,26 @@ template StatusCode Gemv<double2>(const Layout, const Transpose, // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV template <typename T> -StatusCode Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xgbmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoGbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Gbmv<float>(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, @@ -412,15 +431,26 @@ template StatusCode Hemv<double2>(const Layout, const Triangle, // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template <typename T> -StatusCode Hbmv(const Layout, const Triangle, - const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xhbmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHbmv(layout, triangle, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Hbmv<float2>(const Layout, const Triangle, const size_t, const size_t, @@ -441,15 +471,26 @@ template StatusCode Hbmv<double2>(const Layout, const Triangle, // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template <typename T> -StatusCode Hpmv(const Layout, const Triangle, - const size_t, - const T, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xhpmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoHpmv(layout, triangle, + n, + alpha, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Hpmv<float2>(const Layout, const Triangle, const size_t, @@ -510,15 +551,26 @@ template StatusCode Symv<double>(const Layout, const Triangle, // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV template <typename T> -StatusCode Sbmv(const Layout, const Triangle, - const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xsbmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSbmv(layout, triangle, + n, k, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Sbmv<float>(const Layout, const Triangle, const size_t, const size_t, @@ -539,15 +591,26 @@ template StatusCode Sbmv<double>(const Layout, const Triangle, // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV template <typename T> -StatusCode Spmv(const Layout, const Triangle, - const size_t, - const T, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const T, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xspmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSpmv(layout, triangle, + n, + alpha, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc, + beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Spmv<float>(const Layout, const Triangle, const size_t, @@ -568,12 +631,20 @@ template StatusCode Spmv<double>(const Layout, const Triangle, // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV template <typename T> -StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xtrmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoTrmv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); } template StatusCode Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, @@ -598,12 +669,20 @@ template StatusCode Trmv<double2>(const Layout, const Triangle, const Transpose, // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV template <typename T> -StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xtbmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoTbmv(layout, triangle, a_transpose, diagonal, + n, k, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); } template StatusCode Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, @@ -628,12 +707,20 @@ template StatusCode Tbmv<double2>(const Layout, const Triangle, const Transpose, // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV template <typename T> -StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xtpmv<T>(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoTpmv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(ap_buffer), ap_offset, + Buffer<T>(x_buffer), x_offset, x_inc); } template StatusCode Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, diff --git a/src/kernels/xgemv.opencl b/src/kernels/level2/xgemv.opencl index 1e12dd78..8ed0e9e4 100644 --- a/src/kernels/xgemv.opencl +++ b/src/kernels/level2/xgemv.opencl @@ -79,22 +79,189 @@ R"( #endif // ================================================================================================= -// Defines how to load the input matrix in the regular case -// Loads a scalar input value +// Defines how to load the input matrix in the non-vectorized case inline real LoadMatrixA(const __global real* restrict agm, const int x, const int y, - const int a_ld, const int a_offset) { - return agm[x + a_ld*y + a_offset]; + const int a_ld, const int a_offset, const int parameter, + const int kl, const int ku) { + real result; + + // For banded matrices + #if defined(ROUTINE_GBMV) + const int k = ku - y; + if (x >= y-ku && x < y+kl+1) { result = agm[a_ld*y + k + x + a_offset]; } + else { SetToZero(result); } + + // For symmetric/hermitian matrices + #elif defined(ROUTINE_HEMV) || defined(ROUTINE_SYMV) + if ((parameter == 0 && y <= x) || (parameter == 1 && x <= y)) { + result = agm[a_ld*y + x + a_offset]; + #if defined(ROUTINE_HEMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + result = agm[a_ld*x + y + a_offset]; + #if defined(ROUTINE_HEMV) + COMPLEX_CONJUGATE(result); + #endif + } + + // For triangular matrices + #elif defined(ROUTINE_TRMV) + if (((parameter == 0 || parameter == 2) && y <= x) || + ((parameter == 1 || parameter == 3) && x <= y)) { + result = agm[a_ld*y + x + a_offset]; + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + + // For symmetric/hermitian banded matrices + #elif defined(ROUTINE_HBMV) || defined(ROUTINE_SBMV) + if (parameter == 1) { + if (x <= y) { + const int m = kl - y; + if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + const int m = kl - x; + if (y >= x-kl && y <= x) { result = agm[a_ld*x + m + y + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + else { + if (x >= y) { + const int m = -y; + if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + const int m = -x; + if (y >= x && y < x+kl+1) { result = agm[a_ld*x + m + y + a_offset]; } + else { SetToZero(result); } + #if defined(ROUTINE_HBMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + + // For triangular banded matrices + #elif defined(ROUTINE_TBMV) + if (parameter == 1 || parameter == 3) { + if (x <= y) { + const int m = kl - y; + if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + else { + if (x >= y) { + const int m = -y; + if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; } + else { SetToZero(result); } + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + + // For symmetric/hermitian packed matrices + #elif defined(ROUTINE_HPMV) || defined(ROUTINE_SPMV) + if (parameter == 1) { + if (x <= y) { + result = agm[((y+1)*y)/2 + x + a_offset]; + #if defined(ROUTINE_HPMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + result = agm[((x+1)*x)/2 + y + a_offset]; + #if defined(ROUTINE_HPMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + else { + if (x >= y) { + result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset]; + #if defined(ROUTINE_HPMV) + if (x == y) { result.y = ZERO; } + #endif + } + else { + result = agm[((2*a_ld-(x+1))*x)/2 + y + a_offset]; + #if defined(ROUTINE_HPMV) + COMPLEX_CONJUGATE(result); + #endif + } + } + + // For triangular packed matrices + #elif defined(ROUTINE_TPMV) + if (parameter == 1 || parameter == 3) { + if (x <= y) { + result = agm[((y+1)*y)/2 + x + a_offset]; + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + else { + if (x >= y) { + result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset]; + if (parameter >= 2 && y == x) { + SetToOne(result); + } + } + else { + SetToZero(result); + } + } + + // For general matrices + #else + result = agm[a_ld*y + x + a_offset]; + #endif + + return result; } + // Loads a vector input value (1/2) inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y, const int a_ld) { - return agm[x + a_ld*y]; + return agm[a_ld*y + x]; } + // Loads a vector input value (2/2): as before, but different data-type inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y, const int a_ld) { - return agm[x + a_ld*y]; + return agm[a_ld*y + x]; } // ================================================================================================= @@ -106,7 +273,8 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, const __global real* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate) { + const int do_conjugate, const int parameter, + const int kl, const int ku) { // Local memory for the vector X __local real xlm[WGS1]; @@ -141,20 +309,20 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, // The multiply-add function for the main part (divisable by WGS1) if (a_rotated == 0) { // Not rotated #pragma unroll - for (int kl=0; kl<WGS1; ++kl) { - const int k = kwg + kl; - real value = LoadMatrixA(agm, gid, k, a_ld, a_offset); + for (int kloop=0; kloop<WGS1; ++kloop) { + const int k = kwg + kloop; + real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku); if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } - MultiplyAdd(acc[w], xlm[kl], value); + MultiplyAdd(acc[w], xlm[kloop], value); } } else { // Transposed #pragma unroll - for (int kl=0; kl<WGS1; ++kl) { - const int k = kwg + kl; - real value = LoadMatrixA(agm, k, gid, a_ld, a_offset); + for (int kloop=0; kloop<WGS1; ++kloop) { + const int k = kwg + kloop; + real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku); if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } - MultiplyAdd(acc[w], xlm[kl], value); + MultiplyAdd(acc[w], xlm[kloop], value); } } } @@ -174,7 +342,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, if (a_rotated == 0) { // Not rotated #pragma unroll for (int k=n_floor; k<n; ++k) { - real value = LoadMatrixA(agm, gid, k, a_ld, a_offset); + real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku); if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value); } @@ -182,7 +350,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta, else { // Transposed #pragma unroll for (int k=n_floor; k<n; ++k) { - real value = LoadMatrixA(agm, k, gid, a_ld, a_offset); + real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku); if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value); } @@ -209,7 +377,8 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b const __global realVF* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate) { + const int do_conjugate, const int parameter, + const int kl, const int ku) { // Local memory for the vector X __local real xlm[WGS2]; @@ -305,7 +474,8 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea const __global realVFR* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate) { + const int do_conjugate, const int parameter, + const int kl, const int ku) { // Local memory for the vector X __local real xlm[WGS3]; diff --git a/src/routine.cc b/src/routine.cc index 05a03683..2978c94a 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -191,6 +191,18 @@ StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buf return StatusCode::kSuccess; } +// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size +template <typename T> +StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer, + const size_t offset, const size_t data_size) { + try { + auto required_size = (((n*(n+1))/2) + offset)*data_size; + auto buffer_size = buffer.GetSize(); + if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; } + } catch (...) { return StatusCode::kInvalidMatrixA; } + return StatusCode::kSuccess; +} + // ================================================================================================= // Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc new file mode 100644 index 00000000..14d391ca --- /dev/null +++ b/src/routines/level2/xgbmv.cc @@ -0,0 +1,67 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xgbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xgbmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Reverses the upper and lower band count + auto rotated = (layout == Layout::kRowMajor); + auto kl_real = (rotated) ? ku : kl; + auto ku_real = (rotated) ? kl : ku; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_GBMV define. + bool fast_kernels = false; + return MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + 0, false, kl_real, ku_real); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xgbmv<float>; +template class Xgbmv<double>; +template class Xgbmv<float2>; +template class Xgbmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index f95a9957..1b768dcd 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -32,8 +32,7 @@ template <typename T> Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name): Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) { source_string_ = - #include "../../kernels/pad.opencl" // For {Herm,Symm}{Upper,Lower}ToSquared (for HEMV/SYMV) - #include "../../kernels/xgemv.opencl" + #include "../../kernels/level2/xgemv.opencl" ; } @@ -49,6 +48,31 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, const T beta, const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + // Performs the matrix-vector multiplication + return MatVec(layout, a_transpose, + m, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + true, true, + 0, false, 0, 0); // N/A for this routine +} + +// ================================================================================================= + +// The generic implementation, also suited for other (non general) matrix-vector multiplications +template <typename T> +StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc, + bool fast_kernel, bool fast_kernel_rot, + const size_t parameter, const bool packed, + const size_t kl, const size_t ku) { + // Makes sure all dimensions are larger than zero if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } @@ -62,6 +86,11 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, auto m_real = (a_transposed) ? n : m; auto n_real = (a_transposed) ? m : n; + // Special adjustments for banded matrices + if (kl != 0 || ku != 0) { + a_one = kl+ku+1; + } + // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator) auto a_rotated = a_transposed ^ a_altlayout; @@ -69,7 +98,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, auto a_conjugate = (a_transpose == Transpose::kConjugate); // Tests the matrix and the vectors for validity - auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + auto status = StatusCode::kSuccess; + if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); } + else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); } if (ErrorIn(status)) { return status; } status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T)); if (ErrorIn(status)) { return status; } @@ -77,26 +108,26 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, if (ErrorIn(status)) { return status; } // Determines whether or not the fast-version can be used - bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && - IsMultiple(n, db_["WGS2"]) && - IsMultiple(a_ld, db_["VW2"]); - bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && - IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && - IsMultiple(n, db_["WGS3"]) && - IsMultiple(a_ld, db_["VW3"]); + fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS2"]*db_["WPT2"]) && + IsMultiple(n, db_["WGS2"]) && + IsMultiple(a_ld, db_["VW2"]); + fast_kernel_rot = fast_kernel_rot && (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) && + IsMultiple(m, db_["WGS3"]*db_["WPT3"]) && + IsMultiple(n, db_["WGS3"]) && + IsMultiple(a_ld, db_["VW3"]); // If possible, run the fast-version (rotated or non-rotated) of the kernel auto kernel_name = "Xgemv"; auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]); auto global_size = m_ceiled / db_["WPT1"]; auto local_size = db_["WGS1"]; - if (use_fast_kernel) { + if (fast_kernel) { kernel_name = "XgemvFast"; global_size = m_real / db_["WPT2"]; local_size = db_["WGS2"]; } - if (use_fast_kernel_rot) { + if (fast_kernel_rot) { kernel_name = "XgemvFastRot"; global_size = m_real / db_["WPT3"]; local_size = db_["WGS3"]; @@ -123,6 +154,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, kernel.SetArgument(12, static_cast<int>(y_offset)); kernel.SetArgument(13, static_cast<int>(y_inc)); kernel.SetArgument(14, static_cast<int>(a_conjugate)); + kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm + kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices + kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices // Launches the kernel auto global = std::vector<size_t>{global_size}; diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc new file mode 100644 index 00000000..f59a7cb3 --- /dev/null +++ b/src/routines/level2/xhbmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xhbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xhbmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HBMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhbmv<float2>; +template class Xhbmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc index 2d92e45f..5a58b28b 100644 --- a/src/routines/level2/xhemv.cc +++ b/src/routines/level2/xhemv.cc @@ -37,57 +37,21 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle, const T beta, const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared"; - - // Temporary buffer for a copy of the hermitian matrix - try { - auto temp_herm = Buffer<T>(context_, n*n); - - // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemv - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the hermitian-to-squared kernel - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, static_cast<int>(a_ld)); - kernel.SetArgument(2, static_cast<int>(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast<int>(n)); - kernel.SetArgument(5, static_cast<int>(n)); - kernel.SetArgument(6, static_cast<int>(0)); - kernel.SetArgument(7, temp_herm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // hermitian-to-squared kernel uses the same parameters. - auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemv code - status = DoGemv(layout, Transpose::kNo, n, n, alpha, - temp_herm, 0, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc); - - // Return the status of the Xgemv routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HEMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc new file mode 100644 index 00000000..2269255d --- /dev/null +++ b/src/routines/level2/xhpmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xhpmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xhpmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xhpmv<T>::Xhpmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_HPMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xhpmv<float2>; +template class Xhpmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc new file mode 100644 index 00000000..457bd762 --- /dev/null +++ b/src/routines/level2/xsbmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xsbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xsbmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xsbmv<T>::Xsbmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SBMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, k, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsbmv<float>; +template class Xsbmv<double>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc new file mode 100644 index 00000000..4f1a9c61 --- /dev/null +++ b/src/routines/level2/xspmv.cc @@ -0,0 +1,64 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xspmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xspmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xspmv<T>::Xspmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SPMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + ap_buffer, ap_offset, n, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, true, 0, 0); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xspmv<float>; +template class Xspmv<double>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc index 2ccb51f6..ec12324b 100644 --- a/src/routines/level2/xsymv.cc +++ b/src/routines/level2/xsymv.cc @@ -37,57 +37,21 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle, const T beta, const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { - // Makes sure all dimensions are larger than zero - if (n == 0) { return StatusCode::kInvalidDimension; } - - // Checks for validity of the squared A matrix - auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); - if (ErrorIn(status)) { return status; } - - // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as - // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix - bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared"; - - // Temporary buffer for a copy of the symmetric matrix - try { - auto temp_symm = Buffer<T>(context_, n*n); - - // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemv - // routine afterwards - try { - auto& program = GetProgramFromCache(); - auto kernel = Kernel(program, kernel_name); - - // Sets the arguments for the symmetric-to-squared kernel - kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, static_cast<int>(a_ld)); - kernel.SetArgument(2, static_cast<int>(a_offset)); - kernel.SetArgument(3, a_buffer()); - kernel.SetArgument(4, static_cast<int>(n)); - kernel.SetArgument(5, static_cast<int>(n)); - kernel.SetArgument(6, static_cast<int>(0)); - kernel.SetArgument(7, temp_symm()); - - // Uses the common padding kernel's thread configuration. This is allowed, since the - // symmetric-to-squared kernel uses the same parameters. - auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); - if (ErrorIn(status)) { return status; } - - // Runs the regular Xgemv code - status = DoGemv(layout, Transpose::kNo, n, n, alpha, - temp_symm, 0, n, - x_buffer, x_offset, x_inc, beta, - y_buffer, y_offset, y_inc); - - // Return the status of the Xgemv routine - return status; - } catch (...) { return StatusCode::kInvalidKernel; } - } catch (...) { return StatusCode::kTempBufferAllocFailure; } + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific symmetric matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_SYMV define. + bool fast_kernels = false; + return MatVec(layout, Transpose::kNo, + n, n, alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, beta, + y_buffer, y_offset, y_inc, + fast_kernels, fast_kernels, + is_upper, false, 0, 0); } // ================================================================================================= diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc new file mode 100644 index 00000000..2e1aebff --- /dev/null +++ b/src/routines/level2/xtbmv.cc @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtbmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xtbmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtbmv<T>::Xtbmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TBMV define. + auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, k, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtbmv<float>; +template class Xtbmv<double>; +template class Xtbmv<float2>; +template class Xtbmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc new file mode 100644 index 00000000..aa0e099b --- /dev/null +++ b/src/routines/level2/xtpmv.cc @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtpmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xtpmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtpmv<T>::Xtpmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &ap_buffer, const size_t ap_offset, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TPMV define. + auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + ap_buffer, ap_offset, n, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, true, 0, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtpmv<float>; +template class Xtpmv<double>; +template class Xtpmv<float2>; +template class Xtpmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc new file mode 100644 index 00000000..94424743 --- /dev/null +++ b/src/routines/level2/xtrmv.cc @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtrmv class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level2/xtrmv.h" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtrmv<T>::Xtrmv(Queue &queue, Event &event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Creates a copy of X: a temporary scratch buffer + auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); + try { + x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + } catch (...) { } // Continues: error-code is returned in MatVec + + // The data is either in the upper or lower triangle + size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Adds '2' to the parameter if the diagonal is unit + auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper; + + // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels. + // The specific triangular matrix-accesses are implemented in the kernel guarded by the + // ROUTINE_TRMV define. + auto fast_kernels = false; + auto status = MatVec(layout, a_transpose, + n, n, static_cast<T>(1), + a_buffer, a_offset, a_ld, + scratch_buffer, x_offset, x_inc, static_cast<T>(0), + x_buffer, x_offset, x_inc, + fast_kernels, fast_kernels, + parameter, false, 0, 0); + + // Returns the proper error code (renames vector Y to X) + switch(status) { + case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX; + case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX; + case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX; + default: return status; + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrmv<float>; +template class Xtrmv<double>; +template class Xtrmv<float2>; +template class Xtrmv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc index 3d6fe595..6a066518 100644 --- a/src/tuning/xgemv.cc +++ b/src/tuning/xgemv.cc @@ -34,7 +34,7 @@ class TuneXgemv { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/xgemv.opencl" + #include "../src/kernels/level2/xgemv.opencl" ; } |