diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/clblast.cc | 38 | ||||
-rw-r--r-- | src/clblast_c.cc | 16 | ||||
-rw-r--r-- | src/kernels/common.opencl | 4 | ||||
-rw-r--r-- | src/kernels/level1/xamax.opencl | 15 | ||||
-rw-r--r-- | src/kernels/level1/xasum.opencl | 5 |
5 files changed, 54 insertions, 24 deletions
diff --git a/src/clblast.cc b/src/clblast.cc index 4f2e6fb5..fac5a539 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -29,7 +29,9 @@ #include "internal/routines/level1/xdotc.h" #include "internal/routines/level1/xnrm2.h" #include "internal/routines/level1/xasum.h" +#include "internal/routines/level1/xsum.h" // non-BLAS function #include "internal/routines/level1/xamax.h" +#include "internal/routines/level1/xmax.h" // non-BLAS function // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" @@ -430,13 +432,19 @@ template StatusCode PUBLIC_API Asum<double2>(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM template <typename T> -StatusCode Sum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Sum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsum<T>(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSum(n, + Buffer<T>(sum_buffer), sum_offset, + Buffer<T>(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Sum<float>(const size_t, cl_mem, const size_t, @@ -486,13 +494,19 @@ template StatusCode PUBLIC_API Amax<double2>(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX template <typename T> -StatusCode Max(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Max(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xmax<T>(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoMax(n, + Buffer<T>(imax_buffer), imax_offset, + Buffer<T>(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Max<float>(const size_t, cl_mem, const size_t, diff --git a/src/clblast_c.cc b/src/clblast_c.cc index e6270d57..72d93c4b 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -477,41 +477,41 @@ StatusCode CLBlastDzasum(const size_t n, // SUM StatusCode CLBlastSsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum<float>(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast<StatusCode>(status); } StatusCode CLBlastDsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum<double>(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast<StatusCode>(status); } StatusCode CLBlastScsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum<float2>(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast<StatusCode>(status); } StatusCode CLBlastDzsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum<double2>(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast<StatusCode>(status); diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 57d75ee0..d401744d 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -40,6 +40,7 @@ R"( typedef float16 real16; #define ZERO 0.0f #define ONE 1.0f + #define SMALLEST -1.0e37f // Double-precision #elif PRECISION == 64 @@ -50,6 +51,7 @@ R"( typedef double16 real16; #define ZERO 0.0 #define ONE 1.0 + #define SMALLEST -1.0e37 // Complex single-precision #elif PRECISION == 3232 @@ -64,6 +66,7 @@ R"( real sC; real sD; real sE; real sF;} real16; #define ZERO 0.0f #define ONE 1.0f + #define SMALLEST -1.0e37f // Complex Double-precision #elif PRECISION == 6464 @@ -78,6 +81,7 @@ R"( real sC; real sD; real sE; real sF;} real16; #define ZERO 0.0 #define ONE 1.0 + #define SMALLEST -1.0e37 #endif // Single-element version of a complex number diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 03dd05e5..58b75ce2 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -41,14 +41,23 @@ __kernel void Xamax(const int n, const int num_groups = get_num_groups(0); // Performs loading and the first steps of the reduction - singlereal max = ZERO; + #if defined(ROUTINE_MAX) // non-absolute version + singlereal max = SMALLEST; + #else + singlereal max = ZERO; + #endif unsigned int imax = 0; int id = wgid*WGS1 + lid; while (id < n) { + const int x_index = id*x_inc + x_offset; #if PRECISION == 3232 || PRECISION == 6464 - singlereal x = fabs(xgm[id*x_inc + x_offset].x); + singlereal x = xgm[x_index].x; #else - singlereal x = fabs(xgm[id*x_inc + x_offset]); + singlereal x = xgm[x_index]; + #endif + #if defined(ROUTINE_MAX) // non-absolute version + #else + x = fabs(x); #endif if (x >= max) { max = x; diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl index 037dc57e..58d0f11b 100644 --- a/src/kernels/level1/xasum.opencl +++ b/src/kernels/level1/xasum.opencl @@ -45,7 +45,10 @@ __kernel void Xasum(const int n, int id = wgid*WGS1 + lid; while (id < n) { real x = xgm[id*x_inc + x_offset]; - AbsoluteValue(x); + #if defined(ROUTINE_SUM) // non-absolute version + #else + AbsoluteValue(x); + #endif Add(acc, acc, x); id += WGS1*num_groups; } |