diff options
Diffstat (limited to 'src/kernels/level3/xgemm_direct_part2.opencl')
-rw-r--r-- | src/kernels/level3/xgemm_direct_part2.opencl | 12 |
1 files changed, 6 insertions, 6 deletions
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl index 1d9330fc..c3bf1b80 100644 --- a/src/kernels/level3/xgemm_direct_part2.opencl +++ b/src/kernels/level3/xgemm_direct_part2.opencl @@ -19,7 +19,7 @@ R"( // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. -INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, +INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, LOCAL_PTR real* alm, const int a_ld, const int a_offset, const int kwg, const int a_transpose, const int a_conjugate) { #if MDIMCD == MDIMAD @@ -90,7 +90,7 @@ INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __loc } // Same as above, but now for the B input matrix -INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm, +INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, LOCAL_PTR real* blm, const int b_ld, const int b_offset, const int kwg, const int b_transpose, const int b_conjugate) { #if MDIMCD == NDIMBD @@ -165,7 +165,7 @@ INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __loc // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. In contrast to the functions above, this function performs doesn't // use the vector data-types. -INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm, +INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, LOCAL_PTR real* alm, const int a_ld, const int a_offset, const int kwg, const int a_transpose, const int a_conjugate) { #if MDIMCD == MDIMAD @@ -196,7 +196,7 @@ INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __loca } // Same as above, but now for the B input matrix -INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm, +INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, LOCAL_PTR real* blm, const int b_ld, const int b_offset, const int kwg, const int b_transpose, const int b_conjugate) { #if MDIMCD == NDIMBD @@ -231,7 +231,7 @@ INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __loca // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. In contrast to the functions above, this function performs bounds // checks and doesn't use the vector data-types. -INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm, +INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, LOCAL_PTR real* alm, const int a_ld, const int a_offset, const int kwg, const int a_transpose, const int a_conjugate, const int kSizeM, const int kSizeK) { @@ -270,7 +270,7 @@ INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __loc } // Same as above, but now for the B input matrix -INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm, +INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, LOCAL_PTR real* blm, const int b_ld, const int b_offset, const int kwg, const int b_transpose, const int b_conjugate, const int kSizeN, const int kSizeK) { |