From 140dc12854dd9521c1420ccba7eb9fb0d50e054e Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 25 Sep 2016 11:38:35 +0200 Subject: Added a first version of the direct version of GEMM with local memory --- src/kernels/level3/xgemm_direct.opencl | 198 ++++++++++++++++++++++++++++++++- 1 file changed, 194 insertions(+), 4 deletions(-) (limited to 'src/kernels') diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct.opencl index a5e8ca3d..fb5972ba 100644 --- a/src/kernels/level3/xgemm_direct.opencl +++ b/src/kernels/level3/xgemm_direct.opencl @@ -18,6 +18,164 @@ R"( // ================================================================================================= +// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for +// caching the A input matrix. +inline void GlobalToLocalDirectA(const __global realM* restrict agm, __local real* alm, + const int a_ld, const int a_offset, const int tid, const int kwg, + const int a_transpose, const int a_conjugate) { + const int la0 = tid % MDIMA; + const int la1 = tid / MDIMA; + #pragma unroll + for (int mia=0; mia local (matrix A and B) + GlobalToLocalDirectA(agm, alm, a_ld, a_offset, tid, kwg, a_transpose, a_conjugate); + GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, tid, kwg, b_transpose, b_conjugate); + barrier(CLK_LOCAL_MEM_FENCE); + + // Loops over all workitem tiles, unrolled by a factor KWI + for (int pwi=0; pwi private (matrix A) + LocalToPrivateDirectA(alm, apm, kg, a_transpose); + + // Loads data: local --> private (matrix B) + LocalToPrivateDirectB(blm, bpm, kg, b_transpose); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } // Loop over the remaining part (incomplete tile in K-dimension) for (; kwg < kSizeK; ++kwg) { -- cgit v1.2.3