summary | refs | log | tree | commit | diff
path: root/src/kernels
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2017-10-03 21:55:21 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2017-10-03 21:55:21 +0200
commit 375193fe4e72b320eb63fbc6f98c24714f6970c2 (patch)
tree e01cf66d16e0a0e7d024e6ce3ba24fd85803f1fd /src/kernels
parent 74fd6767b93b03fc62462f44854215c4c320babe (diff)
The indirect GEMM implementation now uses a single larger temporary buffer instead of up to 3 optional temporary buffers
Diffstat (limited to 'src/kernels')
-rw-r--r-- src/kernels/level3/xgemm_part3.opencl 9
1 file changed, 7 insertions, 2 deletions
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 3f0d590d..f447677f 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -17,7 +17,7 @@ R"(
// =================================================================================================
-// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
+// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
const __global realM* restrict agm, const __global realN* restrict bgm,
__global realM* cgm, realM cpm[NWI][MWI/VWM]
@@ -192,10 +192,15 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
- __global realM* cgm) {
+ __global realM* cgm,
+ const int b_offset, const int c_offset) {
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
+ // Adds the offsets (in case of use of a single temporary buffer for A, B, and C)
+ bgm = &bgm[b_offset];
+ cgm = &cgm[c_offset];
+
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];