From ca0c075de2a73f250046876b0ca5f90dc4ef0e77 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 3 Oct 2016 20:09:15 +0200 Subject: Added functions to load from off-chip to local memory without vector loads for the GEMM direct kernels --- src/kernels/level3/xgemm_direct_part1.opencl | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'src/kernels/level3/xgemm_direct_part1.opencl') diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl index 2e5addef..a8bd450e 100644 --- a/src/kernels/level3/xgemm_direct_part1.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -182,6 +182,31 @@ inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm // ================================================================================================= +// Caches on-chip local memory into per-thread private memory (registers). This function is specific +// for caching the A input matrix. +inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg, + const int a_transpose) { + #pragma unroll + for (int mi=0; mi