summary | refs | log | tree | commit | diff
path: root/src/kernels
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2016-12-18 11:54:32 +0100
committer Cedric Nugteren <web@cedricnugteren.nl> 2016-12-18 11:54:32 +0100
commit 6b533dda1ce8b4feda68708dec779ddc6200480c (patch)
tree 54a041e38a35b567f59c15bc44afaae809747186 /src/kernels
parent 26e017743191e188bc4ae7c7148a6025cfd74422 (diff)
Fixed a bug when using offsets in the direct GEMM kernels
Diffstat (limited to 'src/kernels')
-rw-r--r-- src/kernels/level3/xgemm_direct_part2.opencl | 4
-rw-r--r-- src/kernels/level3/xgemm_direct_part3.opencl | 4
2 files changed, 4 insertions, 4 deletions
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
index d77cbf65..fc09307e 100644
--- a/src/kernels/level3/xgemm_direct_part2.opencl
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -42,7 +42,7 @@ inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local re
int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
// Loads the data from global memory into the local memory
- const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
+ const realMD avec = agm[idk*(a_ld/VWMD) + idm + (a_offset/VWMD)];
#if VWMD == 1
alm[kg*(WGD + PADA) + mg] = avec;
#elif VWMD == 2
@@ -113,7 +113,7 @@ inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local re
int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
// Loads the data from global memory into the local memory
- const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
+ const realND bvec = bgm[idk*(b_ld/VWND) + idn + (b_offset/VWND)];
#if VWND == 1
blm[kg*(WGD + PADB) + ng] = bvec;
#elif VWND == 2
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
index a9350e00..c04cdeb8 100644
--- a/src/kernels/level3/xgemm_direct_part3.opencl
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -53,13 +53,13 @@ inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
// Loads data: off-chip --> local (matrix A and B)
- if (a_ld % VWMD == 0) {
+ if (a_ld % VWMD == 0 && a_offset % VWMD == 0) {
GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
}
else {
GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
}
- if (b_ld % VWND == 0) {
+ if (b_ld % VWND == 0 && b_offset % VWND == 0) {
GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
}
else {