From 442c31dd508c573023594a803160ddb69d4929f2 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 8 Jul 2017 17:12:16 +0200 Subject: Made the inline keyword in kernels optional currently only enabled for NVIDIA and ARM GPUs --- src/kernels/common.opencl | 19 +++++--- src/kernels/level1/level1.opencl | 4 +- src/kernels/level2/level2.opencl | 24 +++++----- src/kernels/level2/xgemv.opencl | 6 +-- src/kernels/level2/xgemv_fast.opencl | 4 +- src/kernels/level3/copy_pad.opencl | 34 +++++++------- src/kernels/level3/invert_diagonal_blocks.opencl | 18 ++++---- src/kernels/level3/transpose_pad.opencl | 38 ++++++++-------- src/kernels/level3/xgemm_direct_part1.opencl | 56 ++++++++++++------------ src/kernels/level3/xgemm_direct_part2.opencl | 40 ++++++++--------- src/kernels/level3/xgemm_direct_part3.opencl | 18 ++++---- src/kernels/level3/xgemm_part1.opencl | 22 +++++----- src/kernels/level3/xgemm_part2.opencl | 8 ++-- src/kernels/level3/xgemm_part3.opencl | 22 +++++----- src/routine.cpp | 8 +++- 15 files changed, 168 insertions(+), 153 deletions(-) diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index db4c8ec4..9481881e 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -235,6 +235,15 @@ R"( // ================================================================================================= +// Force inlining functions or not: some compilers don't support the inline keyword +#ifdef USE_INLINE_KEYWORD + #define INLINE_FUNC inline +#else + #define INLINE_FUNC +#endif + +// ================================================================================================= + // Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is // enabled (see src/routine.cc). #ifndef USE_STAGGERED_INDICES @@ -245,18 +254,18 @@ R"( // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf // More details: https://github.com/CNugteren/CLBlast/issues/53 #if USE_STAGGERED_INDICES == 1 - inline size_t GetGroupIDFlat() { + INLINE_FUNC size_t GetGroupIDFlat() { return get_group_id(0) + get_num_groups(0) * get_group_id(1); } - inline size_t GetGroupID1() { + INLINE_FUNC size_t GetGroupID1() { return (GetGroupIDFlat()) % get_num_groups(1); } - inline size_t GetGroupID0() { + INLINE_FUNC size_t GetGroupID0() { return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0); } #else - inline size_t GetGroupID1() { return get_group_id(1); } - inline size_t GetGroupID0() { return get_group_id(0); } + INLINE_FUNC size_t GetGroupID1() { return get_group_id(1); } + INLINE_FUNC size_t GetGroupID0() { return get_group_id(0); } #endif // ================================================================================================= diff --git a/src/kernels/level1/level1.opencl b/src/kernels/level1/level1.opencl index 7e10426b..3c60c54a 100644 --- a/src/kernels/level1/level1.opencl +++ b/src/kernels/level1/level1.opencl @@ -47,7 +47,7 @@ R"( // ================================================================================================= // The vectorized multiply function -inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) { +INLINE_FUNC realV MultiplyVector(realV cvec, const real aval, const realV bvec) { #if VW == 1 Multiply(cvec, aval, bvec); #elif VW == 2 @@ -89,7 +89,7 @@ inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) { } // The vectorized multiply-add function -inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { +INLINE_FUNC realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { #if VW == 1 MultiplyAdd(cvec, aval, bvec); #elif VW == 2 diff --git a/src/kernels/level2/level2.opencl b/src/kernels/level2/level2.opencl index be979766..505231ca 100644 --- a/src/kernels/level2/level2.opencl +++ b/src/kernels/level2/level2.opencl @@ -33,9 +33,9 @@ R"( // ================================================================================================= // Returns an element from a vector -inline real LoadVector(const int id, const int max, - __global real* gm, const int offset, const int inc, - const int do_conjugate) { +INLINE_FUNC real LoadVector(const int id, const int max, + __global real* gm, const int offset, const int inc, + const int do_conjugate) { if (id < max) { real result = gm[id*inc + offset]; if (do_conjugate) { @@ -53,10 +53,10 @@ inline real LoadVector(const int id, const int max, } // Performs the rank-1 matrix update -inline void MatrixUpdate(const int id1, const int id2, const int max1, const int max2, - __global real* agm, const int a_offset, const int a_ld, - const real alpha, const real xvalue, const real yvalue, - const int is_upper) { +INLINE_FUNC void MatrixUpdate(const int id1, const int id2, const int max1, const int max2, + __global real* agm, const int a_offset, const int a_ld, + const real alpha, const real xvalue, const real yvalue, + const int is_upper) { // Bounds of a regular matrix if (id1 < max1 && id2 < max2) { @@ -100,11 +100,11 @@ inline void MatrixUpdate(const int id1, const int id2, const int max1, const int } // Performs the rank-2 matrix update -inline void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2, - __global real* agm, const int a_offset, const int a_ld, - const real alpha1, const real xvalue, const real yvalue, - const real alpha2, const real xtvalue, const real ytvalue, - const int is_upper) { +INLINE_FUNC void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2, + __global real* agm, const int a_offset, const int a_ld, + const real alpha1, const real xvalue, const real yvalue, + const real alpha2, const real xtvalue, const real ytvalue, + const int is_upper) { // Bounds of a regular matrix if (id1 < max1 && id2 < max2) { diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl index ff011acd..ea0478f0 100644 --- a/src/kernels/level2/xgemv.opencl +++ b/src/kernels/level2/xgemv.opencl @@ -36,9 +36,9 @@ R"( // ================================================================================================= // Defines how to load the input matrix in the non-vectorized case -inline real LoadMatrixA(const __global real* restrict agm, const int x, const int y, - const int a_ld, const int a_offset, const int parameter, - const int kl, const int ku) { +INLINE_FUNC real LoadMatrixA(const __global real* restrict agm, const int x, const int y, + const int a_ld, const int a_offset, const int parameter, + const int kl, const int ku) { real result; // For banded matrices diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 02a1f956..8a08f076 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -75,8 +75,8 @@ R"( // ================================================================================================= // Loads a vector input value -inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y, - const int a_ld) { +INLINE_FUNC realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y, + const int a_ld) { return agm[a_ld*y + x]; } diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index 93b89187..6eeadbd1 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -24,14 +24,14 @@ R"( // Copies a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld // value and offset can be different. -inline void _CopyPadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real alpha, - const int do_conjugate) { +INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int do_conjugate) { // Loops over the work per thread in both dimensions #pragma unroll @@ -79,15 +79,15 @@ void CopyPadMatrix(const int src_one, const int src_two, // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but // writes only the actual data back to the destination matrix. Again, the ld value and offset can // be different. -inline void _CopyMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { +INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl index 874c1510..93241700 100644 --- a/src/kernels/level3/invert_diagonal_blocks.opencl +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -164,10 +164,10 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // ================================================================================================= // Triple matrix-multiplication kernel: C = A * B -inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n, - __global const real* agm, __global const real* bgm, __global real* cgm, - const int lda, const int ldb, const int ldc, - int current_size, int num_pages, const int block_size) { +INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n, + __global const real* agm, __global const real* bgm, __global real* cgm, + const int lda, const int ldb, const int ldc, + int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) const int by = get_group_id(1) / num_pages; @@ -250,9 +250,9 @@ inline void TripleMatMul(const int size, const bool upper, const int part, __loc // ================================================================================================= // Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower) -inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n, - __global const real* src, const int a_offset, const int lda, - __global real* dest, int current_size, int num_pages, const int block_size) { +INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n, + __global const real* src, const int a_offset, const int lda, + __global real* dest, int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -286,8 +286,8 @@ inline void TripleMatMulPart1(const int size, const bool upper, __local real* bl } // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) -inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n, - __global real* dest, int current_size, int num_pages, const int block_size) { +INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n, + __global real* dest, int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index fb60ce75..49c5b9a3 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -24,15 +24,15 @@ R"( // Transposes a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the transposed source matrix dimensions. -inline void _TransposePadMatrix(__local real* tile, - const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real alpha, - const int do_conjugate) { +INLINE_FUNC void _TransposePadMatrix(__local real* tile, + const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int do_conjugate) { // Loop over the work per thread #pragma unroll @@ -105,16 +105,16 @@ void TransposePadMatrix(const int src_one, const int src_two, // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a // padded source matrix, but only the actual data is written back to the transposed destination // matrix. This kernel optionally checks for upper/lower triangular matrices. -inline void _TransposeMatrix(__local real* tile, - const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { +INLINE_FUNC void _TransposeMatrix(__local real* tile, + const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { // Loop over the work per thread #pragma unroll diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl index a8bd450e..8b650589 100644 --- a/src/kernels/level3/xgemm_direct_part1.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -93,7 +93,7 @@ R"( // ================================================================================================= // Initializes the accumulation registers to zero -inline void InitAccRegistersDirect(real cpm[NWID][MWID]) { +INLINE_FUNC void InitAccRegistersDirect(real cpm[NWID][MWID]) { #pragma unroll for (int mi=0; mi source) { // Adds the name of the routine as a define source_string += "#define ROUTINE_"+routine_name_+"\n"; - // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve + // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on + // which it is known to work with all OpenCL platforms. + if (device_.IsNVIDIA() || device_.IsARM()) { + source_string += "#define USE_INLINE_KEYWORD 1\n"; + } + + // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve // performance, but might result in a reduced accuracy. if (device_.IsAMD() && device_.IsGPU()) { source_string += "#define USE_CL_MAD 1\n"; -- cgit v1.2.3