diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-12-09 14:09:13 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-12-09 14:09:13 +0100 |
commit | 23e3a85f2c328d4a23db2fca5d1d89d78163711f (patch) | |
tree | 02b8dd5364d958184c45c9bfdb2c28e38d72b24e /src/kernels/level3/xgemm_part1.opencl | |
parent | d9df62b7942bb8af5fd385b8545aceb1d8b578f3 (diff) |
Reformatted GEMM kernel to support array-to-register promotion
Diffstat (limited to 'src/kernels/level3/xgemm_part1.opencl')
-rw-r--r-- | src/kernels/level3/xgemm_part1.opencl | 182 |
1 files changed, 82 insertions, 100 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 88744668..053eb721 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -135,50 +135,46 @@ R"( // ================================================================================================= // Initializes the accumulation registers to zero -INLINE_FUNC void InitAccRegisters(realM cpm[NWI*MWI/VWM]) { - #pragma unroll - for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - #pragma unroll - for (int _ni = 0; _ni < NWI; _ni += 1) { - #if VWM == 1 - SetToZero(cpm[_ni * (MWI/VWM) + _mi]); - #elif VWM == 2 - SetToZero(cpm[_ni * (MWI/VWM) + _mi].x); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].y); - #elif VWM == 4 - SetToZero(cpm[_ni * (MWI/VWM) + _mi].x); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].y); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].z); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].w); - #elif VWM == 8 - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s0); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s1); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s2); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s3); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s4); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s5); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s6); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s7); - #elif VWM == 16 - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s0); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s1); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s2); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s3); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s4); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s5); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s6); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s7); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s8); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].s9); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].sA); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].sB); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].sC); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].sD); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].sE); - SetToZero(cpm[_ni * (MWI/VWM) + _mi].sF); - #endif - } - } +INLINE_FUNC realM InitAccRegisters() { + realM result; + #if VWM == 1 + SetToZero(result); + #elif VWM == 2 + SetToZero(result.x); + SetToZero(result.y); + #elif VWM == 4 + SetToZero(result.x); + SetToZero(result.y); + SetToZero(result.z); + SetToZero(result.w); + #elif VWM == 8 + SetToZero(result.s0); + SetToZero(result.s1); + SetToZero(result.s2); + SetToZero(result.s3); + SetToZero(result.s4); + SetToZero(result.s5); + SetToZero(result.s6); + SetToZero(result.s7); + #elif VWM == 16 + SetToZero(result.s0); + SetToZero(result.s1); + SetToZero(result.s2); + SetToZero(result.s3); + SetToZero(result.s4); + SetToZero(result.s5); + SetToZero(result.s6); + SetToZero(result.s7); + SetToZero(result.s8); + SetToZero(result.s9); + SetToZero(result.sA); + SetToZero(result.sB); + SetToZero(result.sC); + SetToZero(result.sD); + SetToZero(result.sE); + SetToZero(result.sF); + #endif + return result; } // ================================================================================================= @@ -249,47 +245,39 @@ INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, LOCAL_PTR re // Caches global off-chip memory directly into per-thread private memory (registers). This function // is specific for caching the A input matrix. #if SA == 0 -INLINE_FUNC void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM], - const int kSizeM, const int idk, const int kwg) { - #pragma unroll - for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - - // Computes the indices based on strided/non-strided access - #if STRM == 0 - int mg = _mi + get_local_id(0)*(MWI/VWM); - #elif STRM == 1 - int mg = get_local_id(0) + _mi*MDIMC; - #endif - - // Computes the indices for the global memory - int idm = mg + GetGroupID0() * (MWG/VWM); - - // Loads the data from global memory (not transposed) and stores into registers - apm[_mi] = agm[idk*(kSizeM/VWM) + idm]; - } +INLINE_FUNC realM GlobalToPrivateA(const __global realM* restrict agm, const int _mi, + const int kSizeM, const int idk, const int kwg) { + // Computes the indices based on strided/non-strided access + #if STRM == 0 + int mg = _mi + get_local_id(0)*(MWI/VWM); + #elif STRM == 1 + int mg = get_local_id(0) + _mi*MDIMC; + #endif + + // Computes the indices for the global memory + int idm = mg + GetGroupID0() * (MWG/VWM); + + // Loads the data from global memory (not transposed) and stores into registers + return agm[idk*(kSizeM/VWM) + idm]; } #endif // Same as above, but now for the B input matrix #if SB == 0 -INLINE_FUNC void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN], - const int kSizeN, const int idk) { - #pragma unroll - for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { - - // Computes the indices based on strided/non-strided access - #if STRN == 0 - int ng = _ni + get_local_id(1)*(NWI/VWN); - #elif STRN == 1 - int ng = get_local_id(1) + _ni*NDIMC; - #endif - - // Computes the indices for the global memory - int idn = ng + GetGroupID1() * (NWG/VWN); - - // Loads the data from global memory (transposed) and stores into registers - bpm[_ni] = bgm[idk*(kSizeN/VWN) + idn]; - } +INLINE_FUNC realN GlobalToPrivateB(const __global realN* restrict bgm, const int _ni, + const int kSizeN, const int idk) { + // Computes the indices based on strided/non-strided access + #if STRN == 0 + int ng = _ni + get_local_id(1)*(NWI/VWN); + #elif STRN == 1 + int ng = get_local_id(1) + _ni*NDIMC; + #endif + + // Computes the indices for the global memory + int idn = ng + GetGroupID1() * (NWG/VWN); + + // Loads the data from global memory (transposed) and stores into registers + return bgm[idk*(kSizeN/VWN) + idn]; } #endif @@ -298,31 +286,25 @@ INLINE_FUNC void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[ // Caches on-chip local memory into per-thread private memory (registers). This function is specific // for caching the A input matrix. #if SA == 1 -INLINE_FUNC void LocalToPrivateA(LOCAL_PTR realM* alm, realM apm[MWI/VWM], const int kg) { - #pragma unroll - for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - #if STRM == 0 - int mg = _mi + get_local_id(0)*(MWI/VWM); - #elif STRM == 1 - int mg = get_local_id(0) + _mi*MDIMC; - #endif - apm[_mi] = alm[kg*(MWG/VWM) + mg]; - } +INLINE_FUNC realM LocalToPrivateA(LOCAL_PTR realM* alm, const int _mi, const int kg) { + #if STRM == 0 + int mg = _mi + get_local_id(0)*(MWI/VWM); + #elif STRM == 1 + int mg = get_local_id(0) + _mi*MDIMC; + #endif + return alm[kg*(MWG/VWM) + mg]; } #endif // Same as above, but now for the B input matrix #if SB == 1 -INLINE_FUNC void LocalToPrivateB(LOCAL_PTR realN* blm, realN bpm[NWI/VWN], const int kg) { - #pragma unroll - for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { - #if STRN == 0 - int ng = _ni + get_local_id(1)*(NWI/VWN); - #elif STRN == 1 - int ng = get_local_id(1) + _ni*NDIMC; - #endif - bpm[_ni] = blm[kg*(NWG/VWN) + ng]; - } +INLINE_FUNC realN LocalToPrivateB(LOCAL_PTR realN* blm, const int _ni, const int kg) { + #if STRN == 0 + int ng = _ni + get_local_id(1)*(NWI/VWN); + #elif STRN == 1 + int ng = get_local_id(1) + _ni*NDIMC; + #endif + return blm[kg*(NWG/VWN) + ng]; } #endif |