From fa84ac36f23c1aebb5facf946b21d8c0f1a4a46d Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 28 Jul 2018 16:01:03 +0200 Subject: The tuners now also check for valid local thread configurations and skip invalid ones completely, saving compilation time --- CHANGELOG | 1 + src/tuning/configurations.cpp | 49 ++++++++++++++++++++++++++++++++++++------- src/tuning/configurations.hpp | 17 +++++++++++++-- src/tuning/tuning.cpp | 3 ++- src/tuning/tuning_api.cpp | 3 ++- 5 files changed, 62 insertions(+), 11 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index c1c639e1..f6d05df3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ Development (next version) - Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah') +- The tuners now check beforehand on invalid local thread sizes and skip those completely - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel - Various minor fixes and enhancements diff --git a/src/tuning/configurations.cpp b/src/tuning/configurations.cpp index 1fe232cf..82d7e3b4 100644 --- a/src/tuning/configurations.cpp +++ b/src/tuning/configurations.cpp @@ -23,28 +23,42 @@ namespace clblast { // Finds all configurations. It also applies the user-defined constraints within. 
std::vector<Configuration> SetConfigurations(const Device& device, const std::vector<Parameter> parameters, + const std::vector<size_t>& local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const Constraints& constraints, const LocalMemSizeInfo& local_mem_size_info) { const auto local_mem_max = device.LocalMemSize(); + const auto max_work_item_sizes = device.MaxWorkItemSizes(); + const auto max_work_group_size = device.MaxWorkGroupSize(); auto config = Configuration(); auto configurations = std::vector<Configuration>(); - PopulateConfigurations(parameters, 0, config, configurations, - local_mem_max, constraints, local_mem_size_info); + PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config, + 0, config, configurations, + local_mem_max, constraints, local_mem_size_info, + max_work_item_sizes, max_work_group_size); return configurations; } // Iterates recursively over all permutations of the user-defined parameters void PopulateConfigurations(const std::vector<Parameter> &parameters, + const std::vector<size_t> local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const size_t index, const Configuration &config, std::vector<Configuration> &configuration, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info) { + const LocalMemSizeInfo& local_mem_size_info, + const std::vector<size_t>& max_work_item_sizes, + const size_t max_work_group_size) { // End of the chain: all parameters are considered, store the resulting configuration if it is a // valid one according to the constraints if (index == parameters.size()) { - if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info)) { + if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info, + local_size_base, mul_local_config, div_local_config, + max_work_item_sizes, max_work_group_size)) { configuration.push_back(config); } return; @@ -55,8 +69,10 @@ void PopulateConfigurations(const 
std::vector<Parameter> &parameters, for (auto &value: parameter.second) { auto config_copy = config; config_copy[parameter.first] = value; - PopulateConfigurations(parameters, index+1, config_copy, configuration, - local_mem_max, constraints, local_mem_size_info); + PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config, + index+1, config_copy, configuration, + local_mem_max, constraints, local_mem_size_info, + max_work_item_sizes, max_work_group_size); } } @@ -64,7 +80,12 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters, bool ValidConfiguration(const Configuration &config, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info) { + const LocalMemSizeInfo& local_mem_size_info, + const std::vector<size_t> local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, + const std::vector<size_t>& max_work_item_sizes, + const size_t max_work_group_size) { // Iterates over all constraints for (auto &constraint: constraints) { @@ -92,6 +113,20 @@ bool ValidConfiguration(const Configuration &config, return false; } + // Checks the local thread size (both per dimension and in total) + const auto local = SetThreadConfiguration(config, local_size_base, + mul_local_config, div_local_config); + for (auto i=size_t{0}; i<local.size(); ++i) { + if (local[i] > max_work_item_sizes[i]) { + return false; + } + } + auto local_size = size_t{1}; + for (auto &item: local) { local_size *= item; } + if (local_size > max_work_group_size) { + return false; + } + // Everything was OK: this configuration is valid return true; } diff --git a/src/tuning/configurations.hpp b/src/tuning/configurations.hpp index faa5498f..4b9ba93f 100644 --- a/src/tuning/configurations.hpp +++ b/src/tuning/configurations.hpp @@ -50,6 +50,9 @@ struct LocalMemSizeInfo { // function to find all configurations. It also applies the user-defined constraints within. 
std::vector<Configuration> SetConfigurations(const Device& device, const std::vector<Parameter> parameters, + const std::vector<size_t>& local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const Constraints& constraints, const LocalMemSizeInfo& local_mem_size_info); @@ -58,11 +61,16 @@ std::vector<Configuration> SetConfigurations(const Device& device, // At the end of each chain (when all parameters are considered), the function stores the result // into the configuration list. void PopulateConfigurations(const std::vector<Parameter> &parameters, + const std::vector<size_t> local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const size_t index, const Configuration &config, std::vector<Configuration> &configuration, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info); + const LocalMemSizeInfo& local_mem_size_info, + const std::vector<size_t>& max_work_item_sizes, + const size_t max_work_group_size); // Loops over all user-defined constraints to check whether or not the configuration is valid. 
// Assumes initially all configurations are valid, then returns false if one of the constraints has @@ -71,7 +79,12 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters, bool ValidConfiguration(const Configuration &config, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info); + const LocalMemSizeInfo& local_mem_size_info, + const std::vector<size_t> local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, + const std::vector<size_t>& max_work_item_sizes, + const size_t max_work_group_size); // Processes multipliers and dividers to obtain the final thread configuration std::vector<size_t> SetThreadConfiguration(const Configuration& config, diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp index 822f8851..d382fb18 100644 --- a/src/tuning/tuning.cpp +++ b/src/tuning/tuning.cpp @@ -172,7 +172,8 @@ void Tuner(int argc, char* argv[], const int V, } // Sets the tunable parameters and their possible values - auto configurations = SetConfigurations(device, settings.parameters, + auto configurations = SetConfigurations(device, settings.parameters, settings.local_size, + settings.mul_local, settings.div_local, SetConstraints(V), ComputeLocalMemSize(V)); printf("* Found %s%zu configuration(s)%s\n", kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); diff --git a/src/tuning/tuning_api.cpp b/src/tuning/tuning_api.cpp index 2eec2e2e..2cc9b786 100644 --- a/src/tuning/tuning_api.cpp +++ b/src/tuning/tuning_api.cpp @@ -264,7 +264,8 @@ StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, } // Sets the tunable parameters and their possible values - auto configurations = SetConfigurations(device, settings.parameters, + auto configurations = SetConfigurations(device, settings.parameters, settings.local_size, + settings.mul_local, settings.div_local, SetConstraints(V), ComputeLocalMemSize(V)); // Select the search method (full search or a random fraction) -- cgit v1.2.3