summaryrefslogtreecommitdiff
path: root/src/kernels/level3/xgemm_direct_part2.opencl
diff options
context:
space:
mode:
Diffstat (limited to 'src/kernels/level3/xgemm_direct_part2.opencl')
-rw-r--r--src/kernels/level3/xgemm_direct_part2.opencl12
1 files changed, 6 insertions, 6 deletions
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
index 1d9330fc..c3bf1b80 100644
--- a/src/kernels/level3/xgemm_direct_part2.opencl
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -19,7 +19,7 @@ R"(
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
-INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
+INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, LOCAL_PTR real* alm,
const int a_ld, const int a_offset, const int kwg,
const int a_transpose, const int a_conjugate) {
#if MDIMCD == MDIMAD
@@ -90,7 +90,7 @@ INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __loc
}
// Same as above, but now for the B input matrix
-INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
+INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, LOCAL_PTR real* blm,
const int b_ld, const int b_offset, const int kwg,
const int b_transpose, const int b_conjugate) {
#if MDIMCD == NDIMBD
@@ -165,7 +165,7 @@ INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __loc
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix. In contrast to the functions above, this function performs doesn't
// use the vector data-types.
-INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
+INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, LOCAL_PTR real* alm,
const int a_ld, const int a_offset, const int kwg,
const int a_transpose, const int a_conjugate) {
#if MDIMCD == MDIMAD
@@ -196,7 +196,7 @@ INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __loca
}
// Same as above, but now for the B input matrix
-INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
+INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, LOCAL_PTR real* blm,
const int b_ld, const int b_offset, const int kwg,
const int b_transpose, const int b_conjugate) {
#if MDIMCD == NDIMBD
@@ -231,7 +231,7 @@ INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __loca
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix. In contrast to the functions above, this function performs bounds
// checks and doesn't use the vector data-types.
-INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
+INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, LOCAL_PTR real* alm,
const int a_ld, const int a_offset, const int kwg,
const int a_transpose, const int a_conjugate,
const int kSizeM, const int kSizeK) {
@@ -270,7 +270,7 @@ INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __loc
}
// Same as above, but now for the B input matrix
-INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
+INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, LOCAL_PTR real* blm,
const int b_ld, const int b_offset, const int kwg,
const int b_transpose, const int b_conjugate,
const int kSizeN, const int kSizeK) {