1 files changed, 6 insertions, 6 deletions
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
index 1d9330fc..c3bf1b80 100644
--- a/src/kernels/level3/xgemm_direct_part2.opencl
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -19,7 +19,7 @@ R"(
 
 // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
 // caching the A input matrix.
-INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
+INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, LOCAL_PTR real* alm,
                                       const int a_ld, const int a_offset, const int kwg,
                                       const int a_transpose, const int a_conjugate) {
   #if MDIMCD == MDIMAD
@@ -90,7 +90,7 @@ INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __loc
 }
 
 // Same as above, but now for the B input matrix
-INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
+INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, LOCAL_PTR real* blm,
                                       const int b_ld, const int b_offset, const int kwg,
                                       const int b_transpose, const int b_conjugate) {
   #if MDIMCD == NDIMBD
@@ -165,7 +165,7 @@ INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __loc
 // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
 // caching the A input matrix. In contrast to the functions above, this function performs doesn't
 // use the vector data-types.
-INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
+INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, LOCAL_PTR real* alm,
                                       const int a_ld, const int a_offset, const int kwg,
                                       const int a_transpose, const int a_conjugate) {
   #if MDIMCD == MDIMAD
@@ -196,7 +196,7 @@ INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __loca
 }
 
 // Same as above, but now for the B input matrix
-INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
+INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, LOCAL_PTR real* blm,
                                       const int b_ld, const int b_offset, const int kwg,
                                       const int b_transpose, const int b_conjugate) {
   #if MDIMCD == NDIMBD
@@ -231,7 +231,7 @@ INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __loca
 // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
 // caching the A input matrix. In contrast to the functions above, this function performs bounds
 // checks and doesn't use the vector data-types.
-INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
+INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, LOCAL_PTR real* alm,
                                        const int a_ld, const int a_offset, const int kwg,
                                        const int a_transpose, const int a_conjugate,
                                        const int kSizeM, const int kSizeK) {
@@ -270,7 +270,7 @@ INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __loc
 }
 
 // Same as above, but now for the B input matrix
-INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
+INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, LOCAL_PTR real* blm,
                                        const int b_ld, const int b_offset, const int kwg,
                                        const int b_transpose, const int b_conjugate,
                                        const int kSizeN, const int kSizeK) {