From ff0c54c3865b45eff807315262e73d3f01cb19c3 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sat, 22 Aug 2015 17:11:20 +0200 Subject: Added the XSWAP, XSCAL and XCOPY level-1 routines --- src/clblast.cc | 129 ++++++++++++++++++++++--- src/clblast_c.cc | 198 ++++++++++++++++++++++++++++++++------- src/kernels/common.opencl | 9 +- src/kernels/level1/level1.opencl | 42 +++++++++ src/kernels/level1/xaxpy.opencl | 6 +- src/kernels/level1/xcopy.opencl | 57 +++++++++++ src/kernels/level1/xscal.opencl | 59 ++++++++++++ src/kernels/level1/xswap.opencl | 61 ++++++++++++ src/routines/level1/xcopy.cc | 117 +++++++++++++++++++++++ src/routines/level1/xscal.cc | 111 ++++++++++++++++++++++ src/routines/level1/xswap.cc | 117 +++++++++++++++++++++++ 11 files changed, 855 insertions(+), 51 deletions(-) create mode 100644 src/kernels/level1/xcopy.opencl create mode 100644 src/kernels/level1/xscal.opencl create mode 100644 src/kernels/level1/xswap.opencl create mode 100644 src/routines/level1/xcopy.cc create mode 100644 src/routines/level1/xscal.cc create mode 100644 src/routines/level1/xswap.cc (limited to 'src') diff --git a/src/clblast.cc b/src/clblast.cc index 12c7b880..c99ad7b1 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -18,6 +18,9 @@ #include "clblast.h" // BLAS level-1 includes +#include "internal/routines/level1/xswap.h" +#include "internal/routines/level1/xscal.h" +#include "internal/routines/level1/xcopy.h" #include "internal/routines/level1/xaxpy.h" // BLAS level-2 includes @@ -40,41 +43,139 @@ namespace clblast { // BLAS level-1 (vector-vector) routines // ================================================================================================= +// SWAP +template +StatusCode Swap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xswap(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSwap(n, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); +template StatusCode Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); +template StatusCode Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); +template StatusCode Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); + +// SCAL +template +StatusCode Scal(const size_t n, + const T alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xscal(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoScal(n, + alpha, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode Scal(const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); +template StatusCode Scal(const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); +template StatusCode Scal(const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); +template StatusCode Scal(const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); + +// COPY +template +StatusCode Copy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xcopy(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoCopy(n, + Buffer(x_buffer), x_offset, x_inc, + Buffer(y_buffer), y_offset, y_inc); +} +template StatusCode Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); +template StatusCode Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); +template StatusCode Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); +template StatusCode Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue* queue, cl_event* event); + // AXPY template -StatusCode Axpy(const size_t n, const T alpha, +StatusCode Axpy(const size_t n, + const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xaxpy(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoAxpy(n, alpha, + return routine.DoAxpy(n, + alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Axpy(const size_t, const float, +template StatusCode Axpy(const size_t, + const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Axpy(const size_t, const double, + cl_command_queue* queue, cl_event* event); +template StatusCode Axpy(const size_t, + const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Axpy(const size_t, const float2, + cl_command_queue* queue, cl_event* event); +template StatusCode Axpy(const size_t, + const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Axpy(const size_t, const double2, + cl_command_queue* queue, cl_event* event); +template StatusCode Axpy(const size_t, + const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); + cl_command_queue* queue, cl_event* event); // ================================================================================================= // BLAS level-2 (matrix-vector) routines diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 3b437aff..ab3994fb 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -19,10 +19,140 @@ extern "C" { #include "clblast.h" #include "internal/utilities.h" +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= +// SWAP +StatusCode CLBlastSswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// SCAL +StatusCode CLBlastSscal(const size_t n, + const float alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDscal(const size_t n, + const double alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCscal(const size_t n, + const cl_float2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZscal(const size_t n, + const cl_double2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// COPY +StatusCode CLBlastScopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + // AXPY StatusCode CLBlastSaxpy(const size_t n, const float alpha, @@ -54,7 +184,7 @@ StatusCode CLBlastCaxpy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Axpy(n, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); @@ -66,7 +196,7 @@ StatusCode CLBlastZaxpy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Axpy(n, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); @@ -127,10 +257,10 @@ StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, auto status = clblast::Gemv(static_cast(layout), static_cast(a_transpose), m, n, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); @@ -146,10 +276,10 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, auto status = clblast::Gemv(static_cast(layout), static_cast(a_transpose), m, n, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); @@ -167,10 +297,10 @@ StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, auto status = clblast::Hemv(static_cast(layout), static_cast(triangle), n, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); @@ -186,10 +316,10 @@ StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, auto status = clblast::Hemv(static_cast(layout), static_cast(triangle), n, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); @@ -292,10 +422,10 @@ StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const static_cast(a_transpose), static_cast(b_transpose), m, n, k, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -312,10 +442,10 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const static_cast(a_transpose), static_cast(b_transpose), m, n, k, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -374,10 +504,10 @@ StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle tri static_cast(side), static_cast(triangle), m, n, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -394,10 +524,10 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri static_cast(side), static_cast(triangle), m, n, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -416,10 +546,10 @@ StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle tri static_cast(side), static_cast(triangle), m, n, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -436,10 +566,10 @@ StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle tri static_cast(side), static_cast(triangle), m, n, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -493,9 +623,9 @@ StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Tran static_cast(triangle), static_cast(a_transpose), n, k, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -511,9 +641,9 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran static_cast(triangle), static_cast(a_transpose), n, k, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -610,10 +740,10 @@ StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Tra static_cast(triangle), static_cast(ab_transpose), n, k, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::float2{beta.s[0], beta.s[1]}, + float2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -630,10 +760,10 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra static_cast(triangle), static_cast(ab_transpose), n, k, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, - clblast::double2{beta.s[0], beta.s[1]}, + double2{beta.s[0], beta.s[1]}, c_buffer, c_offset, c_ld, queue, event); return static_cast(status); @@ -652,7 +782,7 @@ StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Tra static_cast(triangle), static_cast(ab_transpose), n, k, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, @@ -672,7 +802,7 @@ StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Tra static_cast(triangle), static_cast(ab_transpose), n, k, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, @@ -730,7 +860,7 @@ StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle tri static_cast(a_transpose), static_cast(diagonal), m, n, - clblast::float2{alpha.s[0], alpha.s[1]}, + float2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event); @@ -748,7 +878,7 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri static_cast(a_transpose), static_cast(diagonal), m, n, - clblast::double2{alpha.s[0], alpha.s[1]}, + double2{alpha.s[0], alpha.s[1]}, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, queue, event); diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 12d63b99..8e71429e 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -109,12 +109,19 @@ R"( #define SetToOne(a) a = ONE #endif -// Multiply two complex variables (used in the define below) +// Multiply two complex variables (used in the defines below) #if PRECISION == 3232 || PRECISION == 6464 #define MulReal(a, b) a.x*b.x - a.y*b.y #define MulImag(a, b) a.x*b.y + a.y*b.x #endif +// The scalar multiply function +#if PRECISION == 3232 || PRECISION == 6464 + #define Multiply(c, a, b) c.x = MulReal(a,b); c.y = MulImag(a,b) +#else + #define Multiply(c, a, b) c = a * b +#endif + // The scalar multiply-add function #if PRECISION == 3232 || PRECISION == 6464 #define MultiplyAdd(c, a, b) c.x += MulReal(a,b); c.y += MulImag(a,b) diff --git a/src/kernels/level1/level1.opencl b/src/kernels/level1/level1.opencl index 449a20a2..7e10426b 100644 --- a/src/kernels/level1/level1.opencl +++ b/src/kernels/level1/level1.opencl @@ -46,6 +46,48 @@ R"( // ================================================================================================= +// The vectorized multiply function +inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) { + #if VW == 1 + Multiply(cvec, aval, bvec); + #elif VW == 2 + Multiply(cvec.x, aval, bvec.x); + Multiply(cvec.y, aval, bvec.y); + #elif VW == 4 + Multiply(cvec.x, aval, bvec.x); + Multiply(cvec.y, aval, bvec.y); + Multiply(cvec.z, aval, bvec.z); + Multiply(cvec.w, aval, bvec.w); + #elif VW == 8 + Multiply(cvec.s0, aval, bvec.s0); + Multiply(cvec.s1, aval, bvec.s1); + Multiply(cvec.s2, aval, bvec.s2); + Multiply(cvec.s3, aval, bvec.s3); + Multiply(cvec.s4, aval, bvec.s4); + Multiply(cvec.s5, aval, bvec.s5); + Multiply(cvec.s6, aval, bvec.s6); + Multiply(cvec.s7, aval, bvec.s7); + #elif VW == 16 + Multiply(cvec.s0, aval, bvec.s0); + Multiply(cvec.s1, aval, bvec.s1); + Multiply(cvec.s2, aval, bvec.s2); + Multiply(cvec.s3, aval, bvec.s3); + Multiply(cvec.s4, aval, bvec.s4); + Multiply(cvec.s5, aval, bvec.s5); + Multiply(cvec.s6, aval, bvec.s6); + Multiply(cvec.s7, aval, bvec.s7); + Multiply(cvec.s8, aval, bvec.s8); + Multiply(cvec.s9, aval, bvec.s9); + Multiply(cvec.sA, aval, bvec.sA); + Multiply(cvec.sB, aval, bvec.sB); + Multiply(cvec.sC, aval, bvec.sC); + Multiply(cvec.sD, aval, bvec.sD); + Multiply(cvec.sE, aval, bvec.sE); + Multiply(cvec.sF, aval, bvec.sF); + #endif + return cvec; +} + // The vectorized multiply-add function inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { #if VW == 1 diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index 3d926d9e..1f1e8ce0 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -11,6 +11,8 @@ // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't // support vector data-types. // +// This kernel uses the level-1 BLAS common tuning parameters. +// // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string @@ -38,8 +40,8 @@ __kernel void Xaxpy(const int n, const real alpha, // dividable by 'VW', 'WGS' and 'WPT'. __attribute__((reqd_work_group_size(WGS, 1, 1))) __kernel void XaxpyFast(const int n, const real alpha, - const __global realV* restrict xgm, - __global realV* ygm) { + const __global realV* restrict xgm, + __global realV* ygm) { #pragma unroll for (int w=0; w +// +// This file contains the Xcopy kernel. It contains one fast vectorized version in case of unit +// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't +// support vector data-types. +// +// This kernel uses the level-1 BLAS common tuning parameters. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Full version of the kernel with offsets and strided accesses +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void Xcopy(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id +// +// This file contains the Xscal kernel. It contains one fast vectorized version in case of unit +// strides (incx=1) and no offsets (offx=0). Another version is more general, but doesn't support +// vector data-types. +// +// This kernel uses the level-1 BLAS common tuning parameters. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Full version of the kernel with offsets and strided accesses +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void Xscal(const int n, const real alpha, + __global real* xgm, const int x_offset, const int x_inc) { + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id +// +// This file contains the Xswap kernel. It contains one fast vectorized version in case of unit +// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't +// support vector data-types. +// +// This kernel uses the level-1 BLAS common tuning parameters. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Full version of the kernel with offsets and strided accesses +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void Xswap(const int n, + __global real* xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id +// +// This file implements the Xcopy class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level1/xcopy.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xcopy::precision_ = Precision::kSingle; +template <> const Precision Xcopy::precision_ = Precision::kDouble; +template <> const Precision Xcopy::precision_ = Precision::kComplexSingle; +template <> const Precision Xcopy::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xcopy::Xcopy(Queue &queue, Event &event): + Routine(queue, event, "COPY", {"Xaxpy"}, precision_) { + source_string_ = + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xcopy.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xcopy::DoCopy(const size_t n, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy"; + + // Retrieves the Xcopy kernel from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast(x_offset)); + kernel.SetArgument(3, static_cast(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast(y_offset)); + kernel.SetArgument(6, static_cast(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xcopy; +template class Xcopy; +template class Xcopy; +template class Xcopy; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc new file mode 100644 index 00000000..13e1080c --- /dev/null +++ b/src/routines/level1/xscal.cc @@ -0,0 +1,111 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xscal class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level1/xscal.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xscal::precision_ = Precision::kSingle; +template <> const Precision Xscal::precision_ = Precision::kDouble; +template <> const Precision Xscal::precision_ = Precision::kComplexSingle; +template <> const Precision Xscal::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xscal::Xscal(Queue &queue, Event &event): + Routine(queue, event, "SCAL", {"Xaxpy"}, precision_) { + source_string_ = + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xscal.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xscal::DoScal(const size_t n, const T alpha, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vector for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal"; + + // Retrieves the Xscal kernel from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, alpha); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast(x_offset)); + kernel.SetArgument(4, static_cast(x_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xscal; +template class Xscal; +template class Xscal; +template class Xscal; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc new file mode 100644 index 00000000..b22b3bdb --- /dev/null +++ b/src/routines/level1/xswap.cc @@ -0,0 +1,117 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xswap class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level1/xswap.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xswap::precision_ = Precision::kSingle; +template <> const Precision Xswap::precision_ = Precision::kDouble; +template <> const Precision Xswap::precision_ = Precision::kComplexSingle; +template <> const Precision Xswap::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xswap::Xswap(Queue &queue, Event &event): + Routine(queue, event, "SWAP", {"Xaxpy"}, precision_) { + source_string_ = + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xswap.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xswap::DoSwap(const size_t n, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Determines whether or not the fast-version can be used + bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) && + (y_offset == 0) && (y_inc == 1) && + IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]); + + // If possible, run the fast-version of the kernel + auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap"; + + // Retrieves the Xswap kernel from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, y_buffer()); + } + else { + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, x_buffer()); + kernel.SetArgument(2, static_cast(x_offset)); + kernel.SetArgument(3, static_cast(x_inc)); + kernel.SetArgument(4, y_buffer()); + kernel.SetArgument(5, static_cast(y_offset)); + kernel.SetArgument(6, static_cast(y_inc)); + } + + // Launches the kernel + if (use_fast_kernel) { + auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + else { + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + status = RunKernel(kernel, global, local); + } + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xswap; +template class Xswap; +template class Xswap; +template class Xswap; + +// ================================================================================================= +} // namespace clblast -- cgit v1.2.3