diff options
Diffstat (limited to 'src/routine.cc')
-rw-r--r-- | src/routine.cc | 7 |
1 files changed, 7 insertions, 0 deletions
diff --git a/src/routine.cc b/src/routine.cc index 11c4281e..1acd814c 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -90,6 +90,7 @@ StatusCode Routine<T>::SetUp() { // Determines whether this is a specific device const auto isAMD = device_.Vendor() == "AMD" || device_.Vendor() == "Advanced Micro Devices, Inc."; + const auto isARM = device_.Vendor() == "ARM"; const auto isGPU = device_.Type() == "GPU"; // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve @@ -103,6 +104,12 @@ StatusCode Routine<T>::SetUp() { defines += "#define USE_STAGGERED_INDICES 1\n"; } + // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize + // performance through better cache behaviour + if (isARM && isGPU) { + defines += "#define GLOBAL_MEM_FENCE 1\n"; + } + // Combines everything together into a single source string auto source_string = defines + common_header + source_string_; |