diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-07-08 17:12:16 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-07-08 17:12:16 +0200 |
commit | 442c31dd508c573023594a803160ddb69d4929f2 (patch) | |
tree | 55474d09086481117204626b27cbec4ee465be9a /src/kernels/level3 | |
parent | 75c0e861b842dbd08def5e55696fd79d713afc96 (diff) |
Made the inline keyword in kernels optional currently only enabled for NVIDIA and ARM GPUs
Diffstat (limited to 'src/kernels/level3')
-rw-r--r-- | src/kernels/level3/copy_pad.opencl | 34 | ||||
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks.opencl | 18 | ||||
-rw-r--r-- | src/kernels/level3/transpose_pad.opencl | 38 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_direct_part1.opencl | 56 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_direct_part2.opencl | 40 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_direct_part3.opencl | 18 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part1.opencl | 22 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part2.opencl | 8 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part3.opencl | 22 |
9 files changed, 128 insertions, 128 deletions
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index 93b89187..6eeadbd1 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -24,14 +24,14 @@ R"( // Copies a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld // value and offset can be different. -inline void _CopyPadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real alpha, - const int do_conjugate) { +INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int do_conjugate) { // Loops over the work per thread in both dimensions #pragma unroll @@ -79,15 +79,15 @@ void CopyPadMatrix(const int src_one, const int src_two, // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but // writes only the actual data back to the destination matrix. Again, the ld value and offset can // be different. -inline void _CopyMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { +INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl index 874c1510..93241700 100644 --- a/src/kernels/level3/invert_diagonal_blocks.opencl +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -164,10 +164,10 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // ================================================================================================= // Triple matrix-multiplication kernel: C = A * B -inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n, - __global const real* agm, __global const real* bgm, __global real* cgm, - const int lda, const int ldb, const int ldc, - int current_size, int num_pages, const int block_size) { +INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n, + __global const real* agm, __global const real* bgm, __global real* cgm, + const int lda, const int ldb, const int ldc, + int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) const int by = get_group_id(1) / num_pages; @@ -250,9 +250,9 @@ inline void TripleMatMul(const int size, const bool upper, const int part, __loc // ================================================================================================= // Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower) -inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n, - __global const real* src, const int a_offset, const int lda, - __global real* dest, int current_size, int num_pages, const int block_size) { +INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n, + __global const real* src, const int a_offset, const int lda, + __global real* dest, int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -286,8 +286,8 @@ inline void TripleMatMulPart1(const int size, const bool upper, __local real* bl } // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) -inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n, - __global real* dest, int current_size, int num_pages, const int block_size) { +INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n, + __global real* dest, int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index fb60ce75..49c5b9a3 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -24,15 +24,15 @@ R"( // Transposes a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the transposed source matrix dimensions. -inline void _TransposePadMatrix(__local real* tile, - const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real alpha, - const int do_conjugate) { +INLINE_FUNC void _TransposePadMatrix(__local real* tile, + const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int do_conjugate) { // Loop over the work per thread #pragma unroll @@ -105,16 +105,16 @@ void TransposePadMatrix(const int src_one, const int src_two, // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a // padded source matrix, but only the actual data is written back to the transposed destination // matrix. This kernel optionally checks for upper/lower triangular matrices. -inline void _TransposeMatrix(__local real* tile, - const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { +INLINE_FUNC void _TransposeMatrix(__local real* tile, + const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { // Loop over the work per thread #pragma unroll diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl index a8bd450e..8b650589 100644 --- a/src/kernels/level3/xgemm_direct_part1.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -93,7 +93,7 @@ R"( // ================================================================================================= // Initializes the accumulation registers to zero -inline void InitAccRegistersDirect(real cpm[NWID][MWID]) { +INLINE_FUNC void InitAccRegistersDirect(real cpm[NWID][MWID]) { #pragma unroll for (int mi=0; mi<MWID; ++mi) { #pragma unroll @@ -106,7 +106,7 @@ inline void InitAccRegistersDirect(real cpm[NWID][MWID]) { // ================================================================================================= // Performs the actual computation: Cpm += Apm * Bpm -inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) { +INLINE_FUNC void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) { #pragma unroll for (int ni=0; ni<NWID; ++ni) { #pragma unroll @@ -120,9 +120,9 @@ inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real // Loads global off-chip memory into thread-private register files. This function is specific for // loading the A input matrix. -inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID], - const int a_ld, const int a_offset, const int idm, const int idk, - const int a_transpose, const int a_conjugate) { +INLINE_FUNC void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID], + const int a_ld, const int a_offset, const int idm, const int idk, + const int a_transpose, const int a_conjugate) { #pragma unroll for (int mi=0; mi<MWID; ++mi) { const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi); @@ -132,9 +132,9 @@ inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[ } // Same as above, but now for the B input matrix -inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID], - const int b_ld, const int b_offset, const int idn, const int idk, - const int b_transpose, const int b_conjugate) { +INLINE_FUNC void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID], + const int b_ld, const int b_offset, const int idn, const int idk, + const int b_transpose, const int b_conjugate) { #pragma unroll for (int ni=0; ni<NWID; ++ni) { const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni); @@ -145,10 +145,10 @@ inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[ // Loads global off-chip memory into thread-private register files. This function is specific for // loading the A input matrix. This is the same as above but now includes a bounds check. -inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID], - const int a_ld, const int a_offset, const int idm, const int idk, - const int a_transpose, const int a_conjugate, - const int kSizeM) { +INLINE_FUNC void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID], + const int a_ld, const int a_offset, const int idm, const int idk, + const int a_transpose, const int a_conjugate, + const int kSizeM) { #pragma unroll for (int mi=0; mi<MWID; ++mi) { if (idm + mi < kSizeM) { @@ -163,10 +163,10 @@ inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm } // Same as above, but now for the B input matrix -inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID], - const int b_ld, const int b_offset, const int idn, const int idk, - const int b_transpose, const int b_conjugate, - const int kSizeN) { +INLINE_FUNC void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID], + const int b_ld, const int b_offset, const int idn, const int idk, + const int b_transpose, const int b_conjugate, + const int kSizeN) { #pragma unroll for (int ni=0; ni<NWID; ++ni) { if (idn + ni < kSizeN) { @@ -184,8 +184,8 @@ inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm // Caches on-chip local memory into per-thread private memory (registers). This function is specific // for caching the A input matrix. -inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg, - const int a_transpose) { +INLINE_FUNC void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg, + const int a_transpose) { #pragma unroll for (int mi=0; mi<MWID; ++mi) { const int mg = mi + get_local_id(0)*MWID; @@ -195,8 +195,8 @@ inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int k } // Same as above, but now for the B input matrix -inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg, - const int b_transpose) { +INLINE_FUNC void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg, + const int b_transpose) { #pragma unroll for (int ni=0; ni<NWID; ++ni) { const int ng = ni + get_local_id(1)*NWID; @@ -209,10 +209,10 @@ inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int k // Merges the results in Cpm with the global array in Cgm. This also performs the multiplication // with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm -inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], - const int idm, const int idn, - const real alpha, const real beta, - const int c_ld, const int c_offset, const int c_transpose) { +INLINE_FUNC void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], + const int idm, const int idn, + const real alpha, const real beta, + const int c_ld, const int c_offset, const int c_transpose) { #pragma unroll for (int ni=0; ni<NWID; ++ni) { #pragma unroll @@ -237,10 +237,10 @@ inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], // Merges the results in Cpm with the global array in Cgm. This also performs the multiplication // with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm -inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID], - const int idm, const int idn, const int kSizeM, const int kSizeN, - const real alpha, const real beta, - const int c_ld, const int c_offset, const int c_transpose) { +INLINE_FUNC void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID], + const int idm, const int idn, const int kSizeM, const int kSizeN, + const real alpha, const real beta, + const int c_ld, const int c_offset, const int c_transpose) { #pragma unroll for (int ni=0; ni<NWID; ++ni) { #pragma unroll diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl index 3af14bff..1d9330fc 100644 --- a/src/kernels/level3/xgemm_direct_part2.opencl +++ b/src/kernels/level3/xgemm_direct_part2.opencl @@ -19,9 +19,9 @@ R"( // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. -inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, - const int a_ld, const int a_offset, const int kwg, - const int a_transpose, const int a_conjugate) { +INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, + const int a_ld, const int a_offset, const int kwg, + const int a_transpose, const int a_conjugate) { #if MDIMCD == MDIMAD const int la0 = get_local_id(0); const int la1 = get_local_id(1); @@ -90,9 +90,9 @@ inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local re } // Same as above, but now for the B input matrix -inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm, - const int b_ld, const int b_offset, const int kwg, - const int b_transpose, const int b_conjugate) { +INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm, + const int b_ld, const int b_offset, const int kwg, + const int b_transpose, const int b_conjugate) { #if MDIMCD == NDIMBD const int lb0 = get_local_id(0); const int lb1 = get_local_id(1); @@ -165,9 +165,9 @@ inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local re // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. In contrast to the functions above, this function performs doesn't // use the vector data-types. -inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm, - const int a_ld, const int a_offset, const int kwg, - const int a_transpose, const int a_conjugate) { +INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm, + const int a_ld, const int a_offset, const int kwg, + const int a_transpose, const int a_conjugate) { #if MDIMCD == MDIMAD const int la0 = get_local_id(0); const int la1 = get_local_id(1); @@ -196,9 +196,9 @@ inline void GlobalToLocalScalarA(const __global real* restrict agms, __local rea } // Same as above, but now for the B input matrix -inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm, - const int b_ld, const int b_offset, const int kwg, - const int b_transpose, const int b_conjugate) { +INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm, + const int b_ld, const int b_offset, const int kwg, + const int b_transpose, const int b_conjugate) { #if MDIMCD == NDIMBD const int lb0 = get_local_id(0); const int lb1 = get_local_id(1); @@ -231,10 +231,10 @@ inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local rea // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. In contrast to the functions above, this function performs bounds // checks and doesn't use the vector data-types. -inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm, - const int a_ld, const int a_offset, const int kwg, - const int a_transpose, const int a_conjugate, - const int kSizeM, const int kSizeK) { +INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm, + const int a_ld, const int a_offset, const int kwg, + const int a_transpose, const int a_conjugate, + const int kSizeM, const int kSizeK) { #if MDIMCD == MDIMAD const int la0 = get_local_id(0); const int la1 = get_local_id(1); @@ -270,10 +270,10 @@ inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local re } // Same as above, but now for the B input matrix -inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm, - const int b_ld, const int b_offset, const int kwg, - const int b_transpose, const int b_conjugate, - const int kSizeN, const int kSizeK) { +INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm, + const int b_ld, const int b_offset, const int kwg, + const int b_transpose, const int b_conjugate, + const int kSizeN, const int kSizeK) { #if MDIMCD == NDIMBD const int lb0 = get_local_id(0); const int lb1 = get_local_id(1); diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl index c04cdeb8..b0beb614 100644 --- a/src/kernels/level3/xgemm_direct_part3.opencl +++ b/src/kernels/level3/xgemm_direct_part3.opencl @@ -18,15 +18,15 @@ R"( // ================================================================================================= // Main body of the kernel. This is the direct version without pre/post processing and restrictions. -inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realMD* restrict agm, const int a_offset, const int a_ld, - const __global realND* restrict bgm, const int b_offset, const int b_ld, - __global real* cgm, const int c_offset, const int c_ld, - __local real* alm, __local real* blm, - const int a_transpose, const int b_transpose, const int c_transpose, - const int a_conjugate, const int b_conjugate) { +INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + __local real* alm, __local real* blm, + const int a_transpose, const int b_transpose, const int c_transpose, + const int a_conjugate, const int b_conjugate) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index d0ce06ad..07dafe13 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -135,7 +135,7 @@ R"( // ================================================================================================= // Initializes the accumulation registers to zero -inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { +INLINE_FUNC void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { #pragma unroll for (int mi=0; mi<MWI/VWM; ++mi) { #pragma unroll @@ -186,8 +186,8 @@ inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. #if SA == 1 -inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm, - const int kSizeM, const int tid, const int kwg) { +INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm, + const int kSizeM, const int tid, const int kwg) { const int la0 = tid % MDIMA; const int la1 = tid / MDIMA; #pragma unroll @@ -216,8 +216,8 @@ inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* al // Same as above, but now for the B input matrix #if SB == 1 -inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm, - const int kSizeN, const int tid, const int kwg) { +INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm, + const int kSizeN, const int tid, const int kwg) { const int lb0 = tid % NDIMB; const int lb1 = tid / NDIMB; #pragma unroll @@ -249,8 +249,8 @@ inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* bl // Caches global off-chip memory directly into per-thread private memory (registers). This function // is specific for caching the A input matrix. #if SA == 0 -inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM], - const int kSizeM, const int idk, const int kwg) { +INLINE_FUNC void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM], + const int kSizeM, const int idk, const int kwg) { #pragma unroll for (int mi=0; mi<MWI/VWM; ++mi) { @@ -272,8 +272,8 @@ inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/V // Same as above, but now for the B input matrix #if SB == 0 -inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN], - const int kSizeN, const int idk) { +INLINE_FUNC void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN], + const int kSizeN, const int idk) { #pragma unroll for (int ni=0; ni<NWI/VWN; ++ni) { @@ -298,7 +298,7 @@ inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/V // Caches on-chip local memory into per-thread private memory (registers). This function is specific // for caching the A input matrix. #if SA == 1 -inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) { +INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) { #pragma unroll for (int mi=0; mi<MWI/VWM; ++mi) { #if STRM == 0 @@ -313,7 +313,7 @@ inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg // Same as above, but now for the B input matrix #if SB == 1 -inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) { +INLINE_FUNC void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) { #pragma unroll for (int ni=0; ni<NWI/VWN; ++ni) { #if STRN == 0 diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index e8234a29..06fafc8f 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -18,7 +18,7 @@ R"( // ================================================================================================= // The vectorised multiply-add function -inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) { +INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bval) { #if USE_VECTOR_MAD == 1 cvec += avec * bval; #else @@ -64,7 +64,7 @@ inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) { } // Performs the actual computation: Cpm += Apm * Bpm -inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) { +INLINE_FUNC void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) { #pragma unroll for (int ni=0; ni<NWI/VWN; ++ni) { #pragma unroll @@ -115,8 +115,8 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real // Merges the results in Cpm with the global array in Cgm. This also performs the multiplication // with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm -inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM, - const real alpha, const real beta) { +INLINE_FUNC void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM, + const real alpha, const real beta) { #pragma unroll for (int ni=0; ni<NWI; ++ni) { #pragma unroll diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index 8ac3a3a8..3f0d590d 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -18,17 +18,17 @@ R"( // ================================================================================================= // Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above. -inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, - const __global realM* restrict agm, const __global realN* restrict bgm, - __global realM* cgm, realM cpm[NWI][MWI/VWM] - #if SA == 1 && SB == 1 - , __local realM* alm, __local realN* blm - #elif SA == 1 - , __local realM* alm - #elif SB == 1 - , __local realN* blm - #endif - ) { +INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, + const __global realM* restrict agm, const __global realN* restrict bgm, + __global realM* cgm, realM cpm[NWI][MWI/VWM] + #if SA == 1 && SB == 1 + , __local realM* alm, __local realN* blm + #elif SA == 1 + , __local realM* alm + #elif SB == 1 + , __local realN* blm + #endif + ) { // Allocates workitem-private memory (registers) realM apm[MWI/VWM]; |