diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-09-04 17:21:16 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-09-04 17:21:16 +0200 |
commit | b30b26b89e52eceb06f5661622c3de0312206ab4 (patch) | |
tree | 22fa403c54e5039cb8e34723d1e47007c71dcba5 | |
parent | 521bf6cdfc650f82488c1e07918eeabd7b328a78 (diff) |
The GEMM kernel no longer adds beta*C in case beta is zero; this would cause problems if C contains NaNs
-rw-r--r-- | CHANGELOG | 5 | ||||
-rw-r--r-- | src/kernels/common.opencl | 7 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part2.opencl | 124 |
3 files changed, 94 insertions, 42 deletions
@@ -1,15 +1,16 @@ Development version (next release) - Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header +- Improved performance significantly of rotated GEMV computations +- Improved performance of unseen/un-tuned devices by a better default tuning parameter selection - Fixed proper MSVC dllimport and dllexport declarations - Fixed memory leaks related to events not being released - Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems - Fixed a bug related to the cache and retrieval of programs based on the OpenCL context - Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels - Fixed a bug in the OpenCL kernels: now placing __kernel before __attribute__ +- Fixed a bug in level-3 routines when beta is zero and matrix C contains NaNs - Added an option (-warm_up) to do a warm-up run before timing in the performance clients -- Improved performance significantly of rotated GEMV computations -- Improved performance of unseen/un-tuned devices by a better default tuning parameter selection - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 9d2bb65e..223501fd 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -148,6 +148,13 @@ R"( #define SetToOne(a) a = ONE #endif +// Determines whether a variable is zero +#if PRECISION == 3232 || PRECISION == 6464 + #define IsZero(a) ((a.x == ZERO) && (a.y == ZERO)) +#else + #define IsZero(a) (a == ZERO) +#endif + // The absolute value (component-wise) #if PRECISION == 3232 || PRECISION == 6464 #define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y) diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index a1559b54..faf17e49 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -133,49 +133,93 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int #endif int idm = mg + GetGroupID0() * (MWG/VWM); int idn = ng + GetGroupID1() * NWG; - - // The final multiplication with alpha and the addition with beta*C int index = idn*(kSizeM/VWM) + idm; + realM result; realM xval = cpm[ni][mi]; - realM yval = cgm[index]; - #if VWM == 1 - AXPBY(result, alpha, xval, beta, yval); - #elif VWM == 2 - AXPBY(result.x, alpha, xval.x, beta, yval.x); - AXPBY(result.y, alpha, xval.y, beta, yval.y); - #elif VWM == 4 - AXPBY(result.x, alpha, xval.x, beta, yval.x); - AXPBY(result.y, alpha, xval.y, beta, yval.y); - AXPBY(result.z, alpha, xval.z, beta, yval.z); - AXPBY(result.w, alpha, xval.w, beta, yval.w); - #elif VWM == 8 - AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); - AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); - AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); - AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); - AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); - AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); - AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); - AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); - #elif VWM == 16 - AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); - AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); - AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); - AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); - AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); - AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); - AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); - AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); - AXPBY(result.s8, alpha, xval.s8, beta, yval.s8); - AXPBY(result.s9, alpha, xval.s9, beta, yval.s9); - AXPBY(result.sA, alpha, xval.sA, beta, yval.sA); - AXPBY(result.sB, alpha, xval.sB, beta, yval.sB); - AXPBY(result.sC, alpha, xval.sC, beta, yval.sC); - AXPBY(result.sD, alpha, xval.sD, beta, yval.sD); - AXPBY(result.sE, alpha, xval.sE, beta, yval.sE); - AXPBY(result.sF, alpha, xval.sF, beta, yval.sF); - #endif + + // The final multiplication with alpha (in case beta == 0) + if (IsZero(beta)) { + #if VWM == 1 + Multiply(result, alpha, xval); + #elif VWM == 2 + Multiply(result.x, alpha, xval.x); + Multiply(result.y, alpha, xval.y); + #elif VWM == 4 + Multiply(result.x, alpha, xval.x); + Multiply(result.y, alpha, xval.y); + Multiply(result.z, alpha, xval.z); + Multiply(result.w, alpha, xval.w); + #elif VWM == 8 + Multiply(result.s0, alpha, xval.s0); + Multiply(result.s1, alpha, xval.s1); + Multiply(result.s2, alpha, xval.s2); + Multiply(result.s3, alpha, xval.s3); + Multiply(result.s4, alpha, xval.s4); + Multiply(result.s5, alpha, xval.s5); + Multiply(result.s6, alpha, xval.s6); + Multiply(result.s7, alpha, xval.s7); + #elif VWM == 16 + Multiply(result.s0, alpha, xval.s0); + Multiply(result.s1, alpha, xval.s1); + Multiply(result.s2, alpha, xval.s2); + Multiply(result.s3, alpha, xval.s3); + Multiply(result.s4, alpha, xval.s4); + Multiply(result.s5, alpha, xval.s5); + Multiply(result.s6, alpha, xval.s6); + Multiply(result.s7, alpha, xval.s7); + Multiply(result.s8, alpha, xval.s8); + Multiply(result.s9, alpha, xval.s9); + Multiply(result.sA, alpha, xval.sA); + Multiply(result.sB, alpha, xval.sB); + Multiply(result.sC, alpha, xval.sC); + Multiply(result.sD, alpha, xval.sD); + Multiply(result.sE, alpha, xval.sE); + Multiply(result.sF, alpha, xval.sF); + #endif + } + + // The final multiplication with alpha and the addition with beta*C + else { + realM yval = cgm[index]; + #if VWM == 1 + AXPBY(result, alpha, xval, beta, yval); + #elif VWM == 2 + AXPBY(result.x, alpha, xval.x, beta, yval.x); + AXPBY(result.y, alpha, xval.y, beta, yval.y); + #elif VWM == 4 + AXPBY(result.x, alpha, xval.x, beta, yval.x); + AXPBY(result.y, alpha, xval.y, beta, yval.y); + AXPBY(result.z, alpha, xval.z, beta, yval.z); + AXPBY(result.w, alpha, xval.w, beta, yval.w); + #elif VWM == 8 + AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); + AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); + AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); + AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); + AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); + AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); + AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); + AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); + #elif VWM == 16 + AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); + AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); + AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); + AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); + AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); + AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); + AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); + AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); + AXPBY(result.s8, alpha, xval.s8, beta, yval.s8); + AXPBY(result.s9, alpha, xval.s9, beta, yval.s9); + AXPBY(result.sA, alpha, xval.sA, beta, yval.sA); + AXPBY(result.sB, alpha, xval.sB, beta, yval.sB); + AXPBY(result.sC, alpha, xval.sC, beta, yval.sC); + AXPBY(result.sD, alpha, xval.sD, beta, yval.sD); + AXPBY(result.sE, alpha, xval.sE, beta, yval.sE); + AXPBY(result.sF, alpha, xval.sF, beta, yval.sF); + #endif + } cgm[index] = result; } } |