diff options
author | cnugteren <web@cedricnugteren.nl> | 2016-05-15 14:04:34 +0200 |
---|---|---|
committer | cnugteren <web@cedricnugteren.nl> | 2016-05-15 14:04:34 +0200 |
commit | 9065b3468478818e9c5918380af665f2d499a322 (patch) | |
tree | eb505cb765d7375125b8423ce2f8079602efb408 /include/internal | |
parent | 1c72d225c53c123ed810cf3f56f5c92603f7f791 (diff) |
Added support for staggered/shuffled offsets for GEMM to improve performance for large power-of-2 kernels on AMD GPUs
Diffstat (limited to 'include/internal')
-rw-r--r-- | include/internal/tuning.h | 19 |
1 file changed, 15 insertions, 4 deletions
diff --git a/include/internal/tuning.h b/include/internal/tuning.h index 5645a5e5..215beb59 100644 --- a/include/internal/tuning.h +++ b/include/internal/tuning.h @@ -48,14 +48,18 @@ void Tuner(int argc, char* argv[]) { // Tests validity of the given arguments C::TestValidArguments(args); - // Tests for validity of the precision + // Tests for validity of the precision and retrieves properties + auto isAMD = false; + auto isGPU = false; { - auto platform = Platform(args.platform_id); - auto device = Device(platform, args.device_id); + const auto platform = Platform(args.platform_id); + const auto device = Device(platform, args.device_id); if (!PrecisionSupported<T>(device)) { printf("* Unsupported precision, skipping this tuning run\n\n"); return; } + isAMD = device.Vendor() == "AMD" || device.Vendor() == "Advanced Micro Devices, Inc."; + isGPU = device.Type() == "GPU"; } // Creates input buffers with random data @@ -84,8 +88,15 @@ void Tuner(int argc, char* argv[]) { tuner.UseRandomSearch(1.0/args.fraction); } + // Set extra settings for specific defines. This mimics src/routine.cc. + auto defines = std::string{""}; + if (isAMD && isGPU) { + defines += "#define USE_CL_MAD 1\n"; + defines += "#define USE_STAGGERED_INDICES 1\n"; + } + // Loads the kernel sources and defines the kernel to tune - auto sources = C::GetSources(); + auto sources = defines + C::GetSources(); auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef()); |