From 1cbe2ea301c6b28a7d1101142ff347471f7dc197 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 23 Mar 2018 20:29:20 +0100 Subject: Removed arrays as function argument from GEMM kernels for Vivante OpenCL compiler --- src/kernels/level3/xgemm_direct_part1.opencl | 75 ++++++++++++---------------- 1 file changed, 32 insertions(+), 43 deletions(-) (limited to 'src/kernels/level3/xgemm_direct_part1.opencl') diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl index 38aa31fb..8ca2ceb4 100644 --- a/src/kernels/level3/xgemm_direct_part1.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -171,59 +171,48 @@ INLINE_FUNC real LocalToPrivateDirectB(LOCAL_PTR real* blm, const int _ni, const // Merges the results in Cpm with the global array in Cgm. This also performs the multiplication // with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm -INLINE_FUNC void StoreResultsDirect(__global real* cgm, real cpd[NWID * MWID], - const int idm, const int idn, +INLINE_FUNC void StoreResultsDirect(__global real* cgm, const real c_value, + const int _mi, const int _ni, const int idm, const int idn, const real alpha, const real beta, const int c_ld, const int c_offset, const int c_transpose) { - #pragma unroll - for (int _ni = 0; _ni < NWID; _ni += 1) { - #pragma unroll - for (int _mi = 0; _mi < MWID; _mi += 1) { - - // Deter_mines the destination index - int c_index = (c_transpose) ? (idm + _mi)*c_ld + (idn + _ni) : (idn + _ni)*c_ld + (idm + _mi); - - // The final multiplication with alpha (in case beta == 0) - real result; - if (IsZero(beta)) { - Multiply(result, alpha, cpd[_ni * MWID + _mi]); - } - // The final multiplication with alpha and the addition with beta*C - else { - AXPBY(result, alpha, cpd[_ni * MWID + _mi], beta, cgm[c_index + c_offset]); - } - cgm[c_index + c_offset] = result; - } + + // Determines the destination index + int c_index = (c_transpose) ? 
(idm + _mi)*c_ld + (idn + _ni) : (idn + _ni)*c_ld + (idm + _mi); + + // The final multiplication with alpha (in case beta == 0) + real result; + if (IsZero(beta)) { + Multiply(result, alpha, c_value); } + // The final multiplication with alpha and the addition with beta*C + else { + AXPBY(result, alpha, c_value, beta, cgm[c_index + c_offset]); + } + cgm[c_index + c_offset] = result; } // Merges the results in Cpm with the global array in Cgm. This also performs the multiplication // with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm -INLINE_FUNC void StoreResultsChecked(__global real* cgm, real cpd[NWID * MWID], - const int idm, const int idn, const int kSizeM, const int kSizeN, +INLINE_FUNC void StoreResultsChecked(__global real* cgm, const real c_value, + const int _mi, const int _ni, const int idm, const int idn, + const int kSizeM, const int kSizeN, const real alpha, const real beta, const int c_ld, const int c_offset, const int c_transpose) { - #pragma unroll - for (int _ni = 0; _ni < NWID; _ni += 1) { - #pragma unroll - for (int _mi = 0; _mi < MWID; _mi += 1) { - if ((idm + _mi) < kSizeM && (idn + _ni) < kSizeN) { - - // Deter_mines the destination index - int c_index = (c_transpose) ? (idm + _mi)*c_ld + (idn + _ni) : (idn + _ni)*c_ld + (idm + _mi); - - // The final multiplication with alpha (in case beta == 0) - real result; - if (IsZero(beta)) { - Multiply(result, alpha, cpd[_ni * MWID + _mi]); - } - // The final multiplication with alpha and the addition with beta*C - else { - AXPBY(result, alpha, cpd[_ni * MWID + _mi], beta, cgm[c_index + c_offset]); - } - cgm[c_index + c_offset] = result; - } + if ((idm + _mi) < kSizeM && (idn + _ni) < kSizeN) { + + // Determines the destination index + int c_index = (c_transpose) ? 
(idm + _mi)*c_ld + (idn + _ni) : (idn + _ni)*c_ld + (idm + _mi); + + // The final multiplication with alpha (in case beta == 0) + real result; + if (IsZero(beta)) { + Multiply(result, alpha, c_value); + } + // The final multiplication with alpha and the addition with beta*C + else { + AXPBY(result, alpha, c_value, beta, cgm[c_index + c_offset]); } + cgm[c_index + c_offset] = result; } } -- cgit v1.2.3