From c1c4bc5d209280e4ec9be5c0a26f7c94077a6b20 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 3 Oct 2016 19:32:01 +0200 Subject: Re-organised GEMM direct kernel and added faster fall-back version for incomplete rectangles --- src/kernels/level3/xgemm_direct_part2.opencl | 432 ++++++++++++++------------- 1 file changed, 219 insertions(+), 213 deletions(-) (limited to 'src/kernels/level3/xgemm_direct_part2.opencl') diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl index 0d066186..953a3b3c 100644 --- a/src/kernels/level3/xgemm_direct_part2.opencl +++ b/src/kernels/level3/xgemm_direct_part2.opencl @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This is part 2 of 2 of the GEMM kernel. See part 1 for more information. +// This is part 2 of 3 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= @@ -17,179 +17,222 @@ R"( // ================================================================================================= -// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication -// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm -inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], - const int kSizeM, const int kSizeN, - const real alpha, const real beta, - const int c_ld, const int c_offset, const int c_transpose) { +// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for +// caching the A input matrix. +inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, + const int a_ld, const int a_offset, const int kwg, + const int a_transpose, const int a_conjugate) { + #if MDIMCD == MDIMAD + const int la0 = get_local_id(0); + const int la1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int la0 = tid % MDIMAD; + const int la1 = tid / MDIMAD; + #endif #pragma unroll - for (int ni=0; ni local (matrix A and B) - GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); - GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); - barrier(CLK_LOCAL_MEM_FENCE); - - // Loops over all workitem tiles, unrolled by a factor KWID - for (int pwi=0; pwi private (matrix A) - LocalToPrivateDirectA(alm, apm, kg, a_transpose); - - // Loads data: local --> private (matrix B) - LocalToPrivateDirectB(blm, bpm, kg, b_transpose); - - // Performs the accumulation (Cpm += Apm * Bpm) - MultiplyAccumulateDirect(cpm, apm, bpm); +// Same as above, but now for the B input matrix +inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm, + const int b_ld, const int b_offset, const int kwg, + const int b_transpose, const int b_conjugate) { + #if MDIMCD == NDIMBD + const int lb0 = get_local_id(0); + const int lb1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int lb0 = tid % NDIMBD; + const int lb1 = tid / NDIMBD; + #endif + #pragma unroll + for (int kib=0; kib