// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This is part 2 of 3 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, LOCAL_PTR real* alm, const int a_ld, const int a_offset, const int kwg, const int a_transpose, const int a_conjugate) { #if MDIMCD == MDIMAD const int la0 = get_local_id(0); const int la1 = get_local_id(1); #else const int tid = get_local_id(0) + MDIMCD*get_local_id(1); const int la0 = tid % MDIMAD; const int la1 = tid / MDIMAD; #endif #pragma unroll for (int _mia = 0; _mia < MWAD/VWMD; _mia += 1) { #pragma unroll for (int _kia = 0; _kia < KWAD; _kia += 1) { // Computes the indices for the global memory int mg = _mia + la0*(MWAD/VWMD); int kg = _kia + la1*KWAD; int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD); int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg; // Loads the data from global memory into the local memory const realMD avec = agm[idk*(a_ld/VWMD) + idm + (a_offset/VWMD)]; #if VWMD == 1 alm[kg*(WGD + PADA) + mg] = avec; #elif VWMD == 2 alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x; alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y; #elif VWMD == 4 alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x; alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y; alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.z; alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.w; #elif VWMD == 8 alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0; alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1; alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2; alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3; alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4; alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5; alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6; alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7; #elif VWMD == 16 alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0; alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1; alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2; alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3; alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4; alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5; alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6; alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7; alm[kg*(WGD + PADA) + mg*VWMD + 8] = avec.s8; alm[kg*(WGD + PADA) + mg*VWMD + 9] = avec.s9; alm[kg*(WGD + PADA) + mg*VWMD + 10] = avec.sA; alm[kg*(WGD + PADA) + mg*VWMD + 11] = avec.sB; alm[kg*(WGD + PADA) + mg*VWMD + 12] = avec.sC; alm[kg*(WGD + PADA) + mg*VWMD + 13] = avec.sD; alm[kg*(WGD + PADA) + mg*VWMD + 14] = avec.sE; alm[kg*(WGD + PADA) + mg*VWMD + 15] = avec.sF; #endif if (a_conjugate) { for (int vm=0; vm