diff options
Diffstat (limited to 'src/kernels/level3/xgemm_part3.opencl')
-rw-r--r-- | src/kernels/level3/xgemm_part3.opencl | 15 |
1 files changed, 10 insertions, 5 deletions
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 3f0d590d..ce24907c 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -17,16 +17,16 @@ R"(
 
 // =================================================================================================
 
-// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
+// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
 INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
                            const __global realM* restrict agm, const __global realN* restrict bgm,
                            __global realM* cgm, realM cpm[NWI][MWI/VWM]
                            #if SA == 1 && SB == 1
-                             , __local realM* alm, __local realN* blm
+                             , LOCAL_PTR realM* alm, LOCAL_PTR realN* blm
                            #elif SA == 1
-                             , __local realM* alm
+                             , LOCAL_PTR realM* alm
                            #elif SB == 1
-                             , __local realN* blm
+                             , LOCAL_PTR realN* blm
                            #endif
                            ) {
 
@@ -192,10 +192,15 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
            const real_arg arg_beta,
            const __global realM* restrict agm, const __global realN* restrict bgm,
-           __global realM* cgm) {
+           __global realM* cgm,
+           const int b_offset, const int c_offset) {
   const real alpha = GetRealArg(arg_alpha);
   const real beta = GetRealArg(arg_beta);
 
+  // Adds the offsets (in case of use of a single temporary buffer for A, B, and C)
+  bgm = &bgm[b_offset];
+  cgm = &cgm[c_offset];
+
   // Allocates workgroup-private memory (local memory)
   #if SA == 1
     __local realM alm[KWG * MWG/VWM];
|