// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This is part 2 of 3 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, const int a_ld, const int a_offset, const int kwg, const int a_transpose, const int a_conjugate) { #if MDIMCD == MDIMAD const int la0 = get_local_id(0); const int la1 = get_local_id(1); #else const int tid = get_local_id(0) + MDIMCD*get_local_id(1); const int la0 = tid % MDIMAD; const int la1 = tid / MDIMAD; #endif #pragma unroll for (int mia=0; mia