Made the inline keyword in kernels optional; it is currently only enabled for NVIDIA and ARM GPUs

author: Cedric Nugteren <web@cedricnugteren.nl> 2017-07-08 17:12:16 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-07-08 17:12:16 +0200
commit: 442c31dd508c573023594a803160ddb69d4929f2 (patch)
tree: 55474d09086481117204626b27cbec4ee465be9a /src/kernels/level3/xgemm_part1.opencl
parent: 75c0e861b842dbd08def5e55696fd79d713afc96 (diff)
1 files changed, 11 insertions, 11 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index d0ce06ad..07dafe13 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -135,7 +135,7 @@ R"(
 // =================================================================================================
 
 // Initializes the accumulation registers to zero
-inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
+INLINE_FUNC void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
   #pragma unroll
   for (int mi=0; mi<MWI/VWM; ++mi) {
     #pragma unroll
@@ -186,8 +186,8 @@ inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
 // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
 // caching the A input matrix.
 #if SA == 1
-inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
-                           const int kSizeM, const int tid, const int kwg) {
+INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
+                                const int kSizeM, const int tid, const int kwg) {
   const int la0 = tid % MDIMA;
   const int la1 = tid / MDIMA;
   #pragma unroll
@@ -216,8 +216,8 @@ inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* al
 
 // Same as above, but now for the B input matrix
 #if SB == 1
-inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
-                           const int kSizeN, const int tid, const int kwg) {
+INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
+                                const int kSizeN, const int tid, const int kwg) {
   const int lb0 = tid % NDIMB;
   const int lb1 = tid / NDIMB;
   #pragma unroll
@@ -249,8 +249,8 @@ inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* bl
 // Caches global off-chip memory directly into per-thread private memory (registers). This function
 // is specific for caching the A input matrix.
 #if SA == 0
-inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
-                             const int kSizeM, const int idk, const int kwg) {
+INLINE_FUNC void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
+                                  const int kSizeM, const int idk, const int kwg) {
   #pragma unroll
   for (int mi=0; mi<MWI/VWM; ++mi) {
 
@@ -272,8 +272,8 @@ inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/V
 
 // Same as above, but now for the B input matrix
 #if SB == 0
-inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
-                             const int kSizeN, const int idk) {
+INLINE_FUNC void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
+                                  const int kSizeN, const int idk) {
   #pragma unroll
   for (int ni=0; ni<NWI/VWN; ++ni) {
 
@@ -298,7 +298,7 @@ inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/V
 // Caches on-chip local memory into per-thread private memory (registers). This function is specific
 // for caching the A input matrix.
 #if SA == 1
-inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
+INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
   #pragma unroll
   for (int mi=0; mi<MWI/VWM; ++mi) {
     #if STRM == 0
@@ -313,7 +313,7 @@ inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg
 
 // Same as above, but now for the B input matrix
 #if SB == 1
-inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
+INLINE_FUNC void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
   #pragma unroll
   for (int ni=0; ni<NWI/VWN; ++ni) {
     #if STRN == 0
author	Cedric Nugteren <web@cedricnugteren.nl>	2017-07-08 17:12:16 +0200
committer	Cedric Nugteren <web@cedricnugteren.nl>	2017-07-08 17:12:16 +0200
commit	442c31dd508c573023594a803160ddb69d4929f2 (patch)
tree	55474d09086481117204626b27cbec4ee465be9a /src/kernels/level3/xgemm_part1.opencl
parent	75c0e861b842dbd08def5e55696fd79d713afc96 (diff)