4 files changed, 25 insertions, 5 deletions
diff --git a/include/internal/tuning.h b/include/internal/tuning.h
index 3eba6fdb..8fc79aff 100644
--- a/include/internal/tuning.h
+++ b/include/internal/tuning.h
@@ -52,6 +52,7 @@ void Tuner(int argc, char* argv[]) {
 
   // Tests for validity of the precision and retrieves properties
   auto isAMD = false;
+  auto isARM = false;
   auto isGPU = false;
   {
     const auto platform = Platform(args.platform_id);
@@ -61,6 +62,7 @@ void Tuner(int argc, char* argv[]) {
       return;
     }
     isAMD = device.Vendor() == "AMD" || device.Vendor() == "Advanced Micro Devices, Inc.";
+    isARM = device.Vendor() == "ARM";
     isGPU = device.Type() == "GPU";
   }
 
@@ -96,6 +98,9 @@ void Tuner(int argc, char* argv[]) {
     defines += "#define USE_CL_MAD 1\n";
     defines += "#define USE_STAGGERED_INDICES 1\n";
   }
+  if (isARM && isGPU) {
+    defines += "#define GLOBAL_MEM_FENCE 1\n";
+  }
 
   // Loads the kernel sources and defines the kernel to tune
   auto sources = defines + C::GetSources();
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index a2a555de..1ad0a558 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -7,10 +7,10 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
+// This file contains an optimized matrix-multiplication kernel inspired by the paper by Matsumoto
 // et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
 // (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
-// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
+// supports different data-types (SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM) through a pre-processor define.
 //
 // Matrices are accessed as follows:
 // A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
@@ -31,7 +31,7 @@
 //    o-------o        o-----o  
 //                              
 //
-// This kernel is seperated into two files. This is part 1 out of 2,
+// This kernel is separated into two files. This is part 1 out of 2.
 //
 // =================================================================================================
 
@@ -68,7 +68,7 @@ R"(
   #define KWI 1      // Unroll factor of the KWG loop (smaller or equal than KWG)
 #endif
 #ifndef VWM
-  #define VWM 1      // Vector width of matrices A and C 
+  #define VWM 1      // Vector width of matrices A and C
 #endif
 #ifndef VWN
   #define VWN 1      // Vector width of matrix B
@@ -97,7 +97,12 @@ R"(
 #define NWB (NWG/NDIMB)               // Amount of loads-per-thread for matrix B (N-dimension)
 
 // Settings
-#define USE_VECTOR_MAD 0              // Unroll (0) or don't (1) unroll the vector MAD manually
+#ifndef USE_VECTOR_MAD
+  #define USE_VECTOR_MAD 0      // Unroll (0) or don't (1) unroll the vector MAD manually
+#endif
+#ifndef GLOBAL_MEM_FENCE
+  #define GLOBAL_MEM_FENCE 0    // Global synchronisation barrier for potential better performance
+#endif
 
 // =================================================================================================
 
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 56ccdb96..42c1127c 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -258,6 +258,9 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
       barrier(CLK_LOCAL_MEM_FENCE);
     #endif
   }
+  #if GLOBAL_MEM_FENCE == 1
+    barrier(CLK_GLOBAL_MEM_FENCE);
+  #endif
 }
 
 // =================================================================================================
diff --git a/src/routine.cc b/src/routine.cc
index 11c4281e..1acd814c 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -90,6 +90,7 @@ StatusCode Routine<T>::SetUp() {
 
   // Determines whether this is a specific device
   const auto isAMD = device_.Vendor() == "AMD" || device_.Vendor() == "Advanced Micro Devices, Inc.";
+  const auto isARM = device_.Vendor() == "ARM";
   const auto isGPU = device_.Type() == "GPU";
 
   // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve
@@ -103,6 +104,12 @@ StatusCode Routine<T>::SetUp() {
     defines += "#define USE_STAGGERED_INDICES 1\n";
   }
 
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (isARM && isGPU) {
+    defines += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
   // Combines everything together into a single source string
   auto source_string = defines + common_header + source_string_;