From 02c0d640375591b46f9838489e4beb9936508888 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 9 Dec 2017 14:53:10 +0100 Subject: Modified the direct GEMM kernel to support array-to-register promotion --- src/kernels/level3/xgemm_direct_part1.opencl | 114 +++++++++------------------ src/kernels/level3/xgemm_direct_part3.opencl | 89 +++++++++++++++++---- 2 files changed, 111 insertions(+), 92 deletions(-) (limited to 'src') diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl index 80d877cc..38aa31fb 100644 --- a/src/kernels/level3/xgemm_direct_part1.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -92,117 +92,79 @@ R"( // ================================================================================================= -// Initializes the accumulation registers to zero -INLINE_FUNC void InitAccRegistersDirect(real cpd[NWID * MWID]) { - #pragma unroll - for (int _mi = 0; _mi < MWID; _mi += 1) { - #pragma unroll - for (int _ni = 0; _ni < NWID; _ni += 1) { - SetToZero(cpd[_ni * MWID + _mi]); - } - } -} - -// ================================================================================================= - -// Performs the actual computation: Cpm += Apm * Bpm -INLINE_FUNC void MultiplyAccumulateDirect(real cpd[NWID * MWID], real apd[MWID], real bpd[NWID]) { - #pragma unroll - for (int _ni = 0; _ni < NWID; _ni += 1) { - #pragma unroll - for (int _mi = 0; _mi < MWID; _mi += 1) { - MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); - } - } -} - -// ================================================================================================= - // Loads global off-chip memory into thread-private register files. This function is specific for // loading the A input matrix. -INLINE_FUNC void GlobalToPrivateDirectA(const __global real* restrict agms, real apd[MWID], +INLINE_FUNC real GlobalToPrivateDirectA(const __global real* restrict agms, const int _mi, const int a_ld, const int a_offset, const int idm, const int idk, const int a_transpose, const int a_conjugate) { - #pragma unroll - for (int _mi = 0; _mi < MWID; _mi += 1) { - const int a_index = (a_transpose) ? (idm + _mi)*a_ld + idk : idk*a_ld + (idm + _mi); - apd[_mi] = agms[a_index + a_offset]; - if (a_conjugate) { COMPLEX_CONJUGATE(apd[_mi]); } - } + const int a_index = (a_transpose) ? (idm + _mi)*a_ld + idk : idk*a_ld + (idm + _mi); + real result = agms[a_index + a_offset]; + if (a_conjugate) { COMPLEX_CONJUGATE(result); } + return result; } // Same as above, but now for the B input matrix -INLINE_FUNC void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpd[NWID], +INLINE_FUNC real GlobalToPrivateDirectB(const __global real* restrict bgms, const int _ni, const int b_ld, const int b_offset, const int idn, const int idk, const int b_transpose, const int b_conjugate) { - #pragma unroll - for (int _ni = 0; _ni < NWID; _ni += 1) { - const int b_index = (b_transpose) ? (idn + _ni)*b_ld + idk : idk*b_ld + (idn + _ni); - bpd[_ni] = bgms[b_index + b_offset]; - if (b_conjugate) { COMPLEX_CONJUGATE(bpd[_ni]); } - } + const int b_index = (b_transpose) ? (idn + _ni)*b_ld + idk : idk*b_ld + (idn + _ni); + real result = bgms[b_index + b_offset]; + if (b_conjugate) { COMPLEX_CONJUGATE(result); } + return result; } // Loads global off-chip memory into thread-private register files. This function is specific for // loading the A input matrix. This is the same as above but now includes a bounds check. -INLINE_FUNC void GlobalToPrivateCheckedA(const __global real* restrict agms, real apd[MWID], +INLINE_FUNC real GlobalToPrivateCheckedA(const __global real* restrict agms, const int _mi, const int a_ld, const int a_offset, const int idm, const int idk, const int a_transpose, const int a_conjugate, const int kSizeM) { - #pragma unroll - for (int _mi = 0; _mi < MWID; _mi += 1) { - if (idm + _mi < kSizeM) { - const int a_index = (a_transpose) ? (idm + _mi)*a_ld + idk : idk*a_ld + (idm + _mi); - apd[_mi] = agms[a_index + a_offset]; - if (a_conjugate) { COMPLEX_CONJUGATE(apd[_mi]); } - } - else { - SetToZero(apd[_mi]); - } + real result; + if (idm + _mi < kSizeM) { + const int a_index = (a_transpose) ? (idm + _mi)*a_ld + idk : idk*a_ld + (idm + _mi); + result = agms[a_index + a_offset]; + if (a_conjugate) { COMPLEX_CONJUGATE(result); } } + else { + SetToZero(result); + } + return result; } // Same as above, but now for the B input matrix -INLINE_FUNC void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpd[NWID], +INLINE_FUNC real GlobalToPrivateCheckedB(const __global real* restrict bgms, const int _ni, const int b_ld, const int b_offset, const int idn, const int idk, const int b_transpose, const int b_conjugate, const int kSizeN) { - #pragma unroll - for (int _ni = 0; _ni < NWID; _ni += 1) { - if (idn + _ni < kSizeN) { - const int b_index = (b_transpose) ? (idn + _ni)*b_ld + idk : idk*b_ld + (idn + _ni); - bpd[_ni] = bgms[b_index + b_offset]; - if (b_conjugate) { COMPLEX_CONJUGATE(bpd[_ni]); } - } - else { - SetToZero(bpd[_ni]); - } + real result; + if (idn + _ni < kSizeN) { + const int b_index = (b_transpose) ? (idn + _ni)*b_ld + idk : idk*b_ld + (idn + _ni); + result = bgms[b_index + b_offset]; + if (b_conjugate) { COMPLEX_CONJUGATE(result); } + } + else { + SetToZero(result); } + return result; } // ================================================================================================= // Caches on-chip local memory into per-thread private memory (registers). This function is specific // for caching the A input matrix. -INLINE_FUNC void LocalToPrivateDirectA(LOCAL_PTR real* alm, real apd[MWID], const int kg, +INLINE_FUNC real LocalToPrivateDirectA(LOCAL_PTR real* alm, const int _mi, const int kg, const int a_transpose) { - #pragma unroll - for (int _mi = 0; _mi < MWID; _mi += 1) { - const int mg = _mi + get_local_id(0)*MWID; - const int index = (a_transpose) ? mg*(WGD + PADA) + kg : kg*(WGD + PADA) + mg; - apd[_mi] = alm[index]; - } + const int mg = _mi + get_local_id(0)*MWID; + const int index = (a_transpose) ? mg*(WGD + PADA) + kg : kg*(WGD + PADA) + mg; + return alm[index]; } // Same as above, but now for the B input matrix -INLINE_FUNC void LocalToPrivateDirectB(LOCAL_PTR real* blm, real bpd[NWID], const int kg, +INLINE_FUNC real LocalToPrivateDirectB(LOCAL_PTR real* blm, const int _ni, const int kg, const int b_transpose) { - #pragma unroll - for (int _ni = 0; _ni < NWID; _ni += 1) { - const int ng = _ni + get_local_id(1)*NWID; - const int index = (b_transpose) ? ng*(WGD + PADB) + kg : kg*(WGD + PADB) + ng; - bpd[_ni] = blm[index]; - } + const int ng = _ni + get_local_id(1)*NWID; + const int index = (b_transpose) ? ng*(WGD + PADB) + kg : kg*(WGD + PADB) + ng; + return blm[index]; } // ================================================================================================= diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl index f9af7a41..e1532e98 100644 --- a/src/kernels/level3/xgemm_direct_part3.opencl +++ b/src/kernels/level3/xgemm_direct_part3.opencl @@ -35,12 +35,21 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize const __global real* restrict bgms = (const __global real* restrict) bgm; // Allocates workitem-private memory (registers) + #pragma promote_to_registers real apd[MWID]; + #pragma promote_to_registers real bpd[NWID]; + #pragma promote_to_registers real cpd[NWID * MWID]; // Initializes the accumulation registers - InitAccRegistersDirect(cpd); + #pragma unroll + for (int _mi = 0; _mi < MWID; _mi += 1) { + #pragma unroll + for (int _ni = 0; _ni < NWID; _ni += 1) { + SetToZero(cpd[_ni * MWID + _mi]); + } + } // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section // processes only the main parts: output blocks of WGD by WGD. @@ -74,11 +83,23 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize int kg = pwi + _pit; // Loads data: local --> private (matrix A and B) - LocalToPrivateDirectA(alm, apd, kg, a_transpose); - LocalToPrivateDirectB(blm, bpd, kg, b_transpose); + #pragma unroll + for (int _mi = 0; _mi < MWID; _mi += 1) { + apd[_mi] = LocalToPrivateDirectA(alm, _mi, kg, a_transpose); + } + #pragma unroll + for (int _ni = 0; _ni < NWID; _ni += 1) { + bpd[_ni] = LocalToPrivateDirectB(blm, _ni, kg, b_transpose); + } // Performs the accumulation (Cpmd += Apmd * Bpmd) - MultiplyAccumulateDirect(cpd, apd, bpd); + #pragma unroll + for (int _ni = 0; _ni < NWID; _ni += 1) { + #pragma unroll + for (int _mi = 0; _mi < MWID; _mi += 1) { + MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); + } + } } } barrier(CLK_LOCAL_MEM_FENCE); @@ -88,11 +109,23 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize for (; kwg < kSizeK; ++kwg) { // Loads data: off-chip --> private (matrix A and B) - GlobalToPrivateDirectA(agms, apd, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate); - GlobalToPrivateDirectB(bgms, bpd, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate); + #pragma unroll + for (int _mi = 0; _mi < MWID; _mi += 1) { + apd[_mi] = GlobalToPrivateDirectA(agms, _mi, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate); + } + #pragma unroll + for (int _ni = 0; _ni < NWID; _ni += 1) { + bpd[_ni] = GlobalToPrivateDirectB(bgms, _ni, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate); + } // Performs the accumulation (Cpmd += Apmd * Bpmd) - MultiplyAccumulateDirect(cpd, apd, bpd); + #pragma unroll + for (int _ni = 0; _ni < NWID; _ni += 1) { + #pragma unroll + for (int _mi = 0; _mi < MWID; _mi += 1) { + MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); + } + } } // Stores a tile of results and performs the multiplication with alpha and beta @@ -118,11 +151,23 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize int kg = pwi + _pit; // Loads data: local --> private (matrix A and B) - LocalToPrivateDirectA(alm, apd, kg, a_transpose); - LocalToPrivateDirectB(blm, bpd, kg, b_transpose); - - // Performs the accumulation (Cpmd += Apmd * Bpmd) - MultiplyAccumulateDirect(cpd, apd, bpd); + #pragma unroll + for (int _mi = 0; _mi < MWID; _mi += 1) { + apd[_mi] = LocalToPrivateDirectA(alm, _mi, kg, a_transpose); + } + #pragma unroll + for (int _ni = 0; _ni < NWID; _ni += 1) { + bpd[_ni] = LocalToPrivateDirectB(blm, _ni, kg, b_transpose); + } + + // Performs the accumulation (C += A * B) + #pragma unroll + for (int _ni = 0; _ni < NWID; _ni += 1) { + #pragma unroll + for (int _mi = 0; _mi < MWID; _mi += 1) { + MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); + } + } } } barrier(CLK_LOCAL_MEM_FENCE); @@ -132,11 +177,23 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize for (; kwg < kSizeK; ++kwg) { // Loads data: off-chip --> private (matrix A and B) - GlobalToPrivateCheckedA(agms, apd, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM); - GlobalToPrivateCheckedB(bgms, bpd, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN); + #pragma unroll + for (int _mi = 0; _mi < MWID; _mi += 1) { + apd[_mi] = GlobalToPrivateCheckedA(agms, _mi, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM); + } + #pragma unroll + for (int _ni = 0; _ni < NWID; _ni += 1) { + bpd[_ni] = GlobalToPrivateCheckedB(bgms, _ni, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN); + } - // Performs the accumulation (Cpmd += Apmd * Bpmd) - MultiplyAccumulateDirect(cpd, apd, bpd); + // Performs the accumulation (C += A * B) + #pragma unroll + for (int _ni = 0; _ni < NWID; _ni += 1) { + #pragma unroll + for (int _mi = 0; _mi < MWID; _mi += 1) { + MultiplyAdd(cpd[_ni * MWID + _mi], apd[_mi], bpd[_ni]); + } + } } // Stores a tile of results and performs the multiplication with alpha and beta -- cgit v1.2.3