From 61f489e370c56075e166caff6d1ad671ca6787b9 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 2 Oct 2016 15:06:59 +0200 Subject: Split the GEMM direct kernel into two files; set the default tuning target to 256-256-256 --- src/kernels/level3/xgemm_direct_part2.opencl | 207 +++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 src/kernels/level3/xgemm_direct_part2.opencl (limited to 'src/kernels/level3/xgemm_direct_part2.opencl') diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl new file mode 100644 index 00000000..36804f4e --- /dev/null +++ b/src/kernels/level3/xgemm_direct_part2.opencl @@ -0,0 +1,207 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This is part 2 of 2 of the GEMM kernel. See part 1 for more information. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication +// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm +inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], + const int kSizeM, const int kSizeN, + const real alpha, const real beta, + const int c_ld, const int c_offset, const int c_transpose) { + #pragma unroll + for (int ni=0; ni local (matrix A and B) + GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); + GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); + barrier(CLK_LOCAL_MEM_FENCE); + + // Loops over all workitem tiles, unrolled by a factor KWID + for (int pwi=0; pwi private (matrix A) + LocalToPrivateDirectA(alm, apm, kg, a_transpose); + + // Loads data: local --> private (matrix B) + LocalToPrivateDirectB(blm, bpm, kg, b_transpose); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Loop over the remaining part (incomplete tile in K-dimension) + for (; kwg < kSizeK; ++kwg) { + const int idk = kwg; + + // Loads A into register memory + #pragma unroll + for (int mi=0; mi