diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-12-03 16:40:36 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-12-03 16:40:36 +0100 |
commit | cf4555d1f44aea9c82b60211b5650b6b77a1226c (patch) | |
tree | 459676fcf89b85aaab7b014d935d1f5b3ab984fc /src/kernels/level3/xgemm_direct_part1.opencl | |
parent | 0a1a3de58a410f61f3b990537541a633826ea640 (diff) |
Added GEMM (direct and in-direct) to the pre-processor testing; modified the loops in kernel accordingly
Diffstat (limited to 'src/kernels/level3/xgemm_direct_part1.opencl')
-rw-r--r-- | src/kernels/level3/xgemm_direct_part1.opencl | 90 |
1 files changed, 45 insertions, 45 deletions
diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl index 7d185224..e2f9c6a8 100644 --- a/src/kernels/level3/xgemm_direct_part1.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -95,10 +95,10 @@ R"( // Initializes the accumulation registers to zero INLINE_FUNC void InitAccRegistersDirect(real cpm[NWID][MWID]) { #pragma unroll - for (int mi=0; mi<MWID; ++mi) { + for (int _mi = 0; _mi < MWID; _mi += 1) { #pragma unroll - for (int ni=0; ni<NWID; ++ni) { - SetToZero(cpm[ni][mi]); + for (int _ni = 0; _ni < NWID; _ni += 1) { + SetToZero(cpm[_ni][_mi]); } } } @@ -108,10 +108,10 @@ INLINE_FUNC void InitAccRegistersDirect(real cpm[NWID][MWID]) { // Performs the actual computation: Cpm += Apm * Bpm INLINE_FUNC void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) { #pragma unroll - for (int ni=0; ni<NWID; ++ni) { + for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll - for (int mi=0; mi<MWID; ++mi) { - MultiplyAdd(cpm[ni][mi], apm[mi], bpm[ni]); + for (int _mi = 0; _mi < MWID; _mi += 1) { + MultiplyAdd(cpm[_ni][_mi], apm[_mi], bpm[_ni]); } } } @@ -124,10 +124,10 @@ INLINE_FUNC void GlobalToPrivateDirectA(const __global real* restrict agms, real const int a_ld, const int a_offset, const int idm, const int idk, const int a_transpose, const int a_conjugate) { #pragma unroll - for (int mi=0; mi<MWID; ++mi) { - const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi); - apm[mi] = agms[a_index + a_offset]; - if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); } + for (int _mi = 0; _mi < MWID; _mi += 1) { + const int a_index = (a_transpose) ? 
(idm + _mi)*a_ld + idk : idk*a_ld + (idm + _mi); + apm[_mi] = agms[a_index + a_offset]; + if (a_conjugate) { COMPLEX_CONJUGATE(apm[_mi]); } } } @@ -136,10 +136,10 @@ INLINE_FUNC void GlobalToPrivateDirectB(const __global real* restrict bgms, real const int b_ld, const int b_offset, const int idn, const int idk, const int b_transpose, const int b_conjugate) { #pragma unroll - for (int ni=0; ni<NWID; ++ni) { - const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni); - bpm[ni] = bgms[b_index + b_offset]; - if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); } + for (int _ni = 0; _ni < NWID; _ni += 1) { + const int b_index = (b_transpose) ? (idn + _ni)*b_ld + idk : idk*b_ld + (idn + _ni); + bpm[_ni] = bgms[b_index + b_offset]; + if (b_conjugate) { COMPLEX_CONJUGATE(bpm[_ni]); } } } @@ -150,14 +150,14 @@ INLINE_FUNC void GlobalToPrivateCheckedA(const __global real* restrict agms, rea const int a_transpose, const int a_conjugate, const int kSizeM) { #pragma unroll - for (int mi=0; mi<MWID; ++mi) { - if (idm + mi < kSizeM) { - const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi); - apm[mi] = agms[a_index + a_offset]; - if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); } + for (int _mi = 0; _mi < MWID; _mi += 1) { + if (idm + _mi < kSizeM) { + const int a_index = (a_transpose) ? (idm + _mi)*a_ld + idk : idk*a_ld + (idm + _mi); + apm[_mi] = agms[a_index + a_offset]; + if (a_conjugate) { COMPLEX_CONJUGATE(apm[_mi]); } } else { - SetToZero(apm[mi]); + SetToZero(apm[_mi]); } } } @@ -168,14 +168,14 @@ INLINE_FUNC void GlobalToPrivateCheckedB(const __global real* restrict bgms, rea const int b_transpose, const int b_conjugate, const int kSizeN) { #pragma unroll - for (int ni=0; ni<NWID; ++ni) { - if (idn + ni < kSizeN) { - const int b_index = (b_transpose) ? 
(idn + ni)*b_ld + idk : idk*b_ld + (idn + ni); - bpm[ni] = bgms[b_index + b_offset]; - if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); } + for (int _ni = 0; _ni < NWID; _ni += 1) { + if (idn + _ni < kSizeN) { + const int b_index = (b_transpose) ? (idn + _ni)*b_ld + idk : idk*b_ld + (idn + _ni); + bpm[_ni] = bgms[b_index + b_offset]; + if (b_conjugate) { COMPLEX_CONJUGATE(bpm[_ni]); } } else { - SetToZero(bpm[ni]); + SetToZero(bpm[_ni]); } } } @@ -187,10 +187,10 @@ INLINE_FUNC void GlobalToPrivateCheckedB(const __global real* restrict bgms, rea INLINE_FUNC void LocalToPrivateDirectA(LOCAL_PTR real* alm, real apm[MWID], const int kg, const int a_transpose) { #pragma unroll - for (int mi=0; mi<MWID; ++mi) { - const int mg = mi + get_local_id(0)*MWID; + for (int _mi = 0; _mi < MWID; _mi += 1) { + const int mg = _mi + get_local_id(0)*MWID; const int index = (a_transpose) ? mg*(WGD + PADA) + kg : kg*(WGD + PADA) + mg; - apm[mi] = alm[index]; + apm[_mi] = alm[index]; } } @@ -198,10 +198,10 @@ INLINE_FUNC void LocalToPrivateDirectA(LOCAL_PTR real* alm, real apm[MWID], cons INLINE_FUNC void LocalToPrivateDirectB(LOCAL_PTR real* blm, real bpm[NWID], const int kg, const int b_transpose) { #pragma unroll - for (int ni=0; ni<NWID; ++ni) { - const int ng = ni + get_local_id(1)*NWID; + for (int _ni = 0; _ni < NWID; _ni += 1) { + const int ng = _ni + get_local_id(1)*NWID; const int index = (b_transpose) ? ng*(WGD + PADB) + kg : kg*(WGD + PADB) + ng; - bpm[ni] = blm[index]; + bpm[_ni] = blm[index]; } } @@ -214,21 +214,21 @@ INLINE_FUNC void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], const real alpha, const real beta, const int c_ld, const int c_offset, const int c_transpose) { #pragma unroll - for (int ni=0; ni<NWID; ++ni) { + for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll - for (int mi=0; mi<MWID; ++mi) { + for (int _mi = 0; _mi < MWID; _mi += 1) { - // Determines the destination index - int c_index = (c_transpose) ? 
(idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi); + // Determines the destination index + int c_index = (c_transpose) ? (idm + _mi)*c_ld + (idn + _ni) : (idn + _ni)*c_ld + (idm + _mi); // The final multiplication with alpha (in case beta == 0) real result; if (IsZero(beta)) { - Multiply(result, alpha, cpm[ni][mi]); + Multiply(result, alpha, cpm[_ni][_mi]); } // The final multiplication with alpha and the addition with beta*C else { - AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]); + AXPBY(result, alpha, cpm[_ni][_mi], beta, cgm[c_index + c_offset]); } cgm[c_index + c_offset] = result; } @@ -242,22 +242,22 @@ INLINE_FUNC void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID], const real alpha, const real beta, const int c_ld, const int c_offset, const int c_transpose) { #pragma unroll - for (int ni=0; ni<NWID; ++ni) { + for (int _ni = 0; _ni < NWID; _ni += 1) { #pragma unroll - for (int mi=0; mi<MWID; ++mi) { - if ((idm + mi) < kSizeM && (idn + ni) < kSizeN) { + for (int _mi = 0; _mi < MWID; _mi += 1) { + if ((idm + _mi) < kSizeM && (idn + _ni) < kSizeN) { - // Determines the destination index - int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi); + // Determines the destination index + int c_index = (c_transpose) ? (idm + _mi)*c_ld + (idn + _ni) : (idn + _ni)*c_ld + (idm + _mi); // The final multiplication with alpha (in case beta == 0) real result; if (IsZero(beta)) { - Multiply(result, alpha, cpm[ni][mi]); + Multiply(result, alpha, cpm[_ni][_mi]); } // The final multiplication with alpha and the addition with beta*C else { - AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]); + AXPBY(result, alpha, cpm[_ni][_mi], beta, cgm[c_index + c_offset]); } cgm[c_index + c_offset] = result; } |