summary | refs | log | tree | commit | diff
path: root/include/internal
diff options
context:
space:
mode:
author: cnugteren <web@cedricnugteren.nl> 2016-05-15 14:04:34 +0200
committer: cnugteren <web@cedricnugteren.nl> 2016-05-15 14:04:34 +0200
commit: 9065b3468478818e9c5918380af665f2d499a322 (patch)
tree: eb505cb765d7375125b8423ce2f8079602efb408 /include/internal
parent: 1c72d225c53c123ed810cf3f56f5c92603f7f791 (diff)
Added support for staggered/shuffled offsets for GEMM to improve performance for large power-of-2 kernels on AMD GPUs
Diffstat (limited to 'include/internal')
-rw-r--r-- include/internal/tuning.h 19
1 files changed, 15 insertions, 4 deletions
diff --git a/include/internal/tuning.h b/include/internal/tuning.h
index 5645a5e5..215beb59 100644
--- a/include/internal/tuning.h
+++ b/include/internal/tuning.h
@@ -48,14 +48,18 @@ void Tuner(int argc, char* argv[]) {
// Tests validity of the given arguments
C::TestValidArguments(args);
- // Tests for validity of the precision
+ // Tests for validity of the precision and retrieves properties
+ auto isAMD = false;
+ auto isGPU = false;
{
- auto platform = Platform(args.platform_id);
- auto device = Device(platform, args.device_id);
+ const auto platform = Platform(args.platform_id);
+ const auto device = Device(platform, args.device_id);
if (!PrecisionSupported<T>(device)) {
printf("* Unsupported precision, skipping this tuning run\n\n");
return;
}
+ isAMD = device.Vendor() == "AMD" || device.Vendor() == "Advanced Micro Devices, Inc.";
+ isGPU = device.Type() == "GPU";
}
// Creates input buffers with random data
@@ -84,8 +88,15 @@ void Tuner(int argc, char* argv[]) {
tuner.UseRandomSearch(1.0/args.fraction);
}
+ // Set extra settings for specific defines. This mimics src/routine.cc.
+ auto defines = std::string{""};
+ if (isAMD && isGPU) {
+ defines += "#define USE_CL_MAD 1\n";
+ defines += "#define USE_STAGGERED_INDICES 1\n";
+ }
+
// Loads the kernel sources and defines the kernel to tune
- auto sources = C::GetSources();
+ auto sources = defines + C::GetSources();
auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize());
tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef());