From 7709a7308bce5492e06d8867a4dd9dff5b2ba950 Mon Sep 17 00:00:00 2001
From: Tyler Sorensen
Date: Sat, 14 Jul 2018 19:50:47 -0400
Subject: Applied feedback from Cedric from first pull request

---
 src/utilities/compile.cpp | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'src/utilities/compile.cpp')

diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index cd0b3d2b..835f54b4 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -58,24 +58,22 @@ std::shared_ptr<Program> CompileFromSource(
     header_string += "#define GLOBAL_MEM_FENCE 1\n";
   }
 
-  // For GPUs with subgroup support, use subgroup shuffling.
-  // Currently these are Intel via an extension and Nvidia using inline PTX (restricted to 32 bit)
-  if (device.IsGPU() && (device.HasExtension(kKhronosIntelSubgroups) ||
-                         (device.IsNVIDIA() && static_cast<int>(precision) == 32))) {
+  // For Intel GPUs with subgroup support, use subgroup shuffling.
+  if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
     header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
+    header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n";
+  }
 
-    // Define the flavor of subgroup
-    if (device.IsNVIDIA()) {
-      header_string += "#define NVIDIA_WARPS_AS_SUBGROUPS 1\n";
+  // For NVIDIA GPUs, inline PTX can provide subgroup support
+  if (device.IsGPU() && device.IsNVIDIA() && precision == Precision::kSingle) {
+    header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
 
-      // Nvidia additionally needs to check pre or post volta due to new
-      // shuffle commands
-      if (device.IsPostNVIDIAVolta()) {
-        header_string += "#define NVIDIA_POST_VOLTA 1\n";
-      }
+    // Nvidia needs to check pre or post volta due to new shuffle commands
+    if (device.IsPostNVIDIAVolta()) {
+      header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 1\n";
     }
-    else if (device.HasExtension(kKhronosIntelSubgroups)) {
-      header_string += "#define INTEL_SUBGROUP_EXTENSION 1\n";
+    else {
+      header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
     }
   }
 
-- 
cgit v1.2.3