Make the local-memory pointer qualifier a define in OpenCL (LOCAL_PTR instead of a hard-coded __local); some fixes to the recently changed transpose kernel code

author: Cedric Nugteren <web@cedricnugteren.nl> 2017-10-14 17:13:54 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-10-14 17:13:54 +0200
commit: b06bc01da90983ce484fded4e1a87f5fcd5c4eca (patch)
tree: bd835ea8612fc15747c84736c355b4ceb3fb70ff /src/kernels/level3/xgemm_part1.opencl
parent: d9456306e0ee16f18f788106cd8ba74a81bf2e31 (diff)
1 files changed, 4 insertions, 4 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 07dafe13..172b3c6b 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -186,7 +186,7 @@ INLINE_FUNC void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
 // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
 // caching the A input matrix.
 #if SA == 1
-INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
+INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, LOCAL_PTR realM* alm,
                                 const int kSizeM, const int tid, const int kwg) {
   const int la0 = tid % MDIMA;
   const int la1 = tid / MDIMA;
@@ -216,7 +216,7 @@ INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local real
 
 // Same as above, but now for the B input matrix
 #if SB == 1
-INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
+INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, LOCAL_PTR realN* blm,
                                 const int kSizeN, const int tid, const int kwg) {
   const int lb0 = tid % NDIMB;
   const int lb1 = tid / NDIMB;
@@ -298,7 +298,7 @@ INLINE_FUNC void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[
 // Caches on-chip local memory into per-thread private memory (registers). This function is specific
 // for caching the A input matrix.
 #if SA == 1
-INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
+INLINE_FUNC void LocalToPrivateA(LOCAL_PTR realM* alm, realM apm[MWI/VWM], const int kg) {
   #pragma unroll
   for (int mi=0; mi<MWI/VWM; ++mi) {
     #if STRM == 0
@@ -313,7 +313,7 @@ INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const i
 
 // Same as above, but now for the B input matrix
 #if SB == 1
-INLINE_FUNC void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
+INLINE_FUNC void LocalToPrivateB(LOCAL_PTR realN* blm, realN bpm[NWI/VWN], const int kg) {
   #pragma unroll
   for (int ni=0; ni<NWI/VWN; ++ni) {
     #if STRN == 0
author	Cedric Nugteren <web@cedricnugteren.nl>	2017-10-14 17:13:54 +0200
committer	Cedric Nugteren <web@cedricnugteren.nl>	2017-10-14 17:13:54 +0200
commit	b06bc01da90983ce484fded4e1a87f5fcd5c4eca (patch)
tree	bd835ea8612fc15747c84736c355b4ceb3fb70ff /src/kernels/level3/xgemm_part1.opencl
parent	d9456306e0ee16f18f788106cd8ba74a81bf2e31 (diff)