From 798d32edad091b6faaa1627a7514868fc28c5fd9 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 17 Jul 2016 14:36:51 +0200 Subject: Improved the GEMM direct kernel by adding register blocking. Still not fast though --- src/kernels/level3/xgemm_direct.opencl | 190 ++++++++++++++++++++++++++++----- 1 file changed, 161 insertions(+), 29 deletions(-) (limited to 'src/kernels') diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct.opencl index 9d2a55c8..a5e8ca3d 100644 --- a/src/kernels/level3/xgemm_direct.opencl +++ b/src/kernels/level3/xgemm_direct.opencl @@ -18,48 +18,180 @@ R"( // ================================================================================================= -// Main entry point of the kernel. This is the direct version. -__attribute__((reqd_work_group_size(16, 16, 1))) +// Initializes the accumulation registers to zero +inline void InitAccRegistersDirect(real cpm[NWI][MWI]) { + #pragma unroll + for (int mi=0; mi