diff options
-rw-r--r-- | include/internal/tuning.h | 5 |
-rw-r--r-- | src/kernels/level3/xgemm_part1.opencl | 15 |
-rw-r--r-- | src/kernels/level3/xgemm_part2.opencl | 3 |
-rw-r--r-- | src/routine.cc | 7 |
4 files changed, 25 insertions, 5 deletions
diff --git a/include/internal/tuning.h b/include/internal/tuning.h index 3eba6fdb..8fc79aff 100644 --- a/include/internal/tuning.h +++ b/include/internal/tuning.h @@ -52,6 +52,7 @@ void Tuner(int argc, char* argv[]) { // Tests for validity of the precision and retrieves properties auto isAMD = false; + auto isARM = false; auto isGPU = false; { const auto platform = Platform(args.platform_id); @@ -61,6 +62,7 @@ void Tuner(int argc, char* argv[]) { return; } isAMD = device.Vendor() == "AMD" || device.Vendor() == "Advanced Micro Devices, Inc."; + isARM = device.Vendor() == "ARM"; isGPU = device.Type() == "GPU"; } @@ -96,6 +98,9 @@ void Tuner(int argc, char* argv[]) { defines += "#define USE_CL_MAD 1\n"; defines += "#define USE_STAGGERED_INDICES 1\n"; } + if (isARM && isGPU) { + defines += "#define GLOBAL_MEM_FENCE 1\n"; + } // Loads the kernel sources and defines the kernel to tune auto sources = defines + C::GetSources(); diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index a2a555de..1ad0a558 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -7,10 +7,10 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto +// This file contains an optimized matrix-multiplication kernel inspired by the paper by Matsumoto // et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable // (and tunable!) using more or less the same parameters/naming conventions as in the paper. It -// supports single and double precision (SGEMM/DGEMM) through a pre-processor define. +// supports different data-types (SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM) through a pre-processor define. 
// // Matrices are accessed as follows: // A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m) @@ -31,7 +31,7 @@ // o-------o o-----o // // -// This kernel is seperated into two files. This is part 1 out of 2, +// This kernel is seperated into two files. This is part 1 out of 2. // // ================================================================================================= @@ -68,7 +68,7 @@ R"( #define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG) #endif #ifndef VWM - #define VWM 1 // Vector width of matrices A and C + #define VWM 1 // Vector width of matrices A and C #endif #ifndef VWN #define VWN 1 // Vector width of matrix B @@ -97,7 +97,12 @@ R"( #define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension) // Settings -#define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually +#ifndef USE_VECTOR_MAD + #define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually +#endif +#ifndef GLOBAL_MEM_FENCE + #define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance +#endif // ================================================================================================= diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index 56ccdb96..42c1127c 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -258,6 +258,9 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, barrier(CLK_LOCAL_MEM_FENCE); #endif } + #if GLOBAL_MEM_FENCE == 1 + barrier(CLK_GLOBAL_MEM_FENCE); + #endif } // ================================================================================================= diff --git a/src/routine.cc b/src/routine.cc index 11c4281e..1acd814c 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -90,6 +90,7 @@ StatusCode Routine<T>::SetUp() { // Determines whether this is a specific device const auto isAMD = device_.Vendor() 
== "AMD" || device_.Vendor() == "Advanced Micro Devices, Inc."; + const auto isARM = device_.Vendor() == "ARM"; const auto isGPU = device_.Type() == "GPU"; // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve @@ -103,6 +104,12 @@ StatusCode Routine<T>::SetUp() { defines += "#define USE_STAGGERED_INDICES 1\n"; } + // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize + // performance through better cache behaviour + if (isARM && isGPU) { + defines += "#define GLOBAL_MEM_FENCE 1\n"; + } + // Combines everything together into a single source string auto source_string = defines + common_header + source_string_; |