diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-05-13 20:49:34 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-05-13 20:49:34 +0200 |
commit | 120c31a30f933eea12d4dfffd4951fa22102ef5f (patch) | |
tree | 853aa6fae0522c9e92fce266c5fddb12a19dafd3 /src/kernels | |
parent | f2ba75890c522b4fe1762bfeac3e08667cf9588a (diff) |
Initial experimental version of the half-precision HAXPY routine
Diffstat (limited to 'src/kernels')
-rw-r--r-- | src/kernels/common.opencl | 18 | ||||
-rw-r--r-- | src/kernels/level1/xaxpy.opencl | 6 |
2 files changed, 19 insertions, 5 deletions
```diff
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 349f9e4f..df9ec35b 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -25,6 +25,11 @@ R"(
 // =================================================================================================
 
 // Enable support for double-precision
+#if PRECISION == 16
+  #pragma OPENCL EXTENSION cl_khr_fp16: enable
+#endif
+
+// Enable support for double-precision
 #if PRECISION == 64 || PRECISION == 6464
   #if __OPENCL_VERSION__ <= CL_VERSION_1_1
     #pragma OPENCL EXTENSION cl_khr_fp64: enable
@@ -38,9 +43,9 @@ R"(
   typedef half4 real4;
   typedef half8 real8;
   typedef half16 real16;
-  #define ZERO 0.0
-  #define ONE 1.0
-  #define SMALLEST -1.0e37
+  #define ZERO 0.0h
+  #define ONE 1.0h
+  #define SMALLEST -1.0e37h
 
 // Single-precision
 #elif PRECISION == 32
@@ -95,6 +100,13 @@ R"(
   #define SMALLEST -1.0e37
 #endif
 
+// Kernel argument scalar
+#if PRECISION == 16
+  typedef float realarg;
+#else
+  typedef real realarg;
+#endif
+
 // Single-element version of a complex number
 #if PRECISION == 3232
   typedef float singlereal;
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index 574beb43..58b7a196 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -23,9 +23,10 @@ R"(
 
 // Full version of the kernel with offsets and strided accesses
 __attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const real alpha,
+__kernel void Xaxpy(const int n, const realarg arg_alpha,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc) {
+  const real alpha = (real)arg_alpha;
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
   #pragma unroll
@@ -40,9 +41,10 @@ __kernel void Xaxpy(const int n, const real alpha,
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
 __attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const real alpha,
+__kernel void XaxpyFast(const int n, const realarg arg_alpha,
                         const __global realV* restrict xgm,
                         __global realV* ygm) {
+  const real alpha = (real)arg_alpha;
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
     const int id = w*get_global_size(0) + get_global_id(0);
```