summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2016-06-08 10:13:37 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2016-06-08 10:13:37 +0200
commit 6925003e45e5c681aaeb26c95ffa29275cebcaac (patch)
tree ef088f43ab7aeec82ecfe85cec9c145ef1d46ff0 /src
parent 6d6b0300532a48fe9f638898b630891d38173538 (diff)
Added global memory synchronisation for better cache performance on ARM Mali GPUs
Diffstat (limited to 'src')
-rw-r--r-- src/kernels/level3/xgemm_part1.opencl 15
-rw-r--r-- src/kernels/level3/xgemm_part2.opencl 3
-rw-r--r-- src/routine.cc 7
3 files changed, 20 insertions, 5 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index a2a555de..1ad0a558 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -7,10 +7,10 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
+// This file contains an optimized matrix-multiplication kernel inspired by the paper by Matsumoto
// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
-// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
+// supports different data-types (SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM) through a pre-processor define.
//
// Matrices are accessed as follows:
// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
@@ -31,7 +31,7 @@
// o-------o o-----o
//
//
-// This kernel is seperated into two files. This is part 1 out of 2,
+// This kernel is separated into two files. This is part 1 out of 2.
//
// =================================================================================================
@@ -68,7 +68,7 @@ R"(
#define KWI 1 // Unroll factor of the KWG loop (smaller than or equal to KWG)
#endif
#ifndef VWM
- #define VWM 1 // Vector width of matrices A and C
+ #define VWM 1 // Vector width of matrices A and C
#endif
#ifndef VWN
#define VWN 1 // Vector width of matrix B
@@ -97,7 +97,12 @@ R"(
#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension)
// Settings
-#define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually
+#ifndef USE_VECTOR_MAD
+ #define USE_VECTOR_MAD 0 // Unroll (0) or don't unroll (1) the vector MAD manually
+#endif
+#ifndef GLOBAL_MEM_FENCE
+ #define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potentially better performance
+#endif
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 56ccdb96..42c1127c 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -258,6 +258,9 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
barrier(CLK_LOCAL_MEM_FENCE);
#endif
}
+ #if GLOBAL_MEM_FENCE == 1
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ #endif
}
// =================================================================================================
diff --git a/src/routine.cc b/src/routine.cc
index 11c4281e..1acd814c 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -90,6 +90,7 @@ StatusCode Routine<T>::SetUp() {
// Determines whether this is a specific device
const auto isAMD = device_.Vendor() == "AMD" || device_.Vendor() == "Advanced Micro Devices, Inc.";
+ const auto isARM = device_.Vendor() == "ARM";
const auto isGPU = device_.Type() == "GPU";
// For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
@@ -103,6 +104,12 @@ StatusCode Routine<T>::SetUp() {
defines += "#define USE_STAGGERED_INDICES 1\n";
}
+ // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+ // performance through better cache behaviour
+ if (isARM && isGPU) {
+ defines += "#define GLOBAL_MEM_FENCE 1\n";
+ }
+
// Combines everything together into a single source string
auto source_string = defines + common_header + source_string_;