summaryrefslogtreecommitdiff
path: root/src/kernels/level3/xgemm_part1.opencl
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-07-08 17:12:16 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2017-07-08 17:12:16 +0200
commit442c31dd508c573023594a803160ddb69d4929f2 (patch)
tree55474d09086481117204626b27cbec4ee465be9a /src/kernels/level3/xgemm_part1.opencl
parent75c0e861b842dbd08def5e55696fd79d713afc96 (diff)
Made the inline keyword in kernels optional currently only enabled for NVIDIA and ARM GPUs
Diffstat (limited to 'src/kernels/level3/xgemm_part1.opencl')
-rw-r--r--src/kernels/level3/xgemm_part1.opencl22
1 files changed, 11 insertions, 11 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index d0ce06ad..07dafe13 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -135,7 +135,7 @@ R"(
// =================================================================================================
// Initializes the accumulation registers to zero
-inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
+INLINE_FUNC void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
@@ -186,8 +186,8 @@ inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
#if SA == 1
-inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
- const int kSizeM, const int tid, const int kwg) {
+INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
+ const int kSizeM, const int tid, const int kwg) {
const int la0 = tid % MDIMA;
const int la1 = tid / MDIMA;
#pragma unroll
@@ -216,8 +216,8 @@ inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* al
// Same as above, but now for the B input matrix
#if SB == 1
-inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
- const int kSizeN, const int tid, const int kwg) {
+INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
+ const int kSizeN, const int tid, const int kwg) {
const int lb0 = tid % NDIMB;
const int lb1 = tid / NDIMB;
#pragma unroll
@@ -249,8 +249,8 @@ inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* bl
// Caches global off-chip memory directly into per-thread private memory (registers). This function
// is specific for caching the A input matrix.
#if SA == 0
-inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
- const int kSizeM, const int idk, const int kwg) {
+INLINE_FUNC void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
+ const int kSizeM, const int idk, const int kwg) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
@@ -272,8 +272,8 @@ inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/V
// Same as above, but now for the B input matrix
#if SB == 0
-inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
- const int kSizeN, const int idk) {
+INLINE_FUNC void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
+ const int kSizeN, const int idk) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
@@ -298,7 +298,7 @@ inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/V
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
// for caching the A input matrix.
#if SA == 1
-inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
+INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
@@ -313,7 +313,7 @@ inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg
// Same as above, but now for the B input matrix
#if SB == 1
-inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
+INLINE_FUNC void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
#if STRN == 0