summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Cedric Nugteren <web@cedricnugteren.nl> 2018-07-28 16:01:03 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2018-07-28 16:01:03 +0200
commit: fa84ac36f23c1aebb5facf946b21d8c0f1a4a46d (patch)
tree: 624d3830c45d40561f2e88946f4fd37e5addca74
parent: dda1e567f872d3d89f2f7cd890fb5b29ff98537c (diff)
The tuners now also check for valid local thread configurations and skip invalid ones completely, saving compilation time
-rw-r--r-- CHANGELOG | 1
-rw-r--r-- src/tuning/configurations.cpp | 49
-rw-r--r-- src/tuning/configurations.hpp | 17
-rw-r--r-- src/tuning/tuning.cpp | 3
-rw-r--r-- src/tuning/tuning_api.cpp | 3
5 files changed, 62 insertions, 11 deletions
diff --git a/CHANGELOG b/CHANGELOG
index c1c639e1..f6d05df3 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,7 @@
Development (next version)
- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
+- The tuners now check beforehand on invalid local thread sizes and skip those completely
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
- Various minor fixes and enhancements
diff --git a/src/tuning/configurations.cpp b/src/tuning/configurations.cpp
index 1fe232cf..82d7e3b4 100644
--- a/src/tuning/configurations.cpp
+++ b/src/tuning/configurations.cpp
@@ -23,28 +23,42 @@ namespace clblast {
// Finds all configurations. It also applies the user-defined constraints within.
std::vector<Configuration> SetConfigurations(const Device& device,
const std::vector<Parameter> parameters,
+ const std::vector<size_t>& local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const Constraints& constraints,
const LocalMemSizeInfo& local_mem_size_info) {
const auto local_mem_max = device.LocalMemSize();
+ const auto max_work_item_sizes = device.MaxWorkItemSizes();
+ const auto max_work_group_size = device.MaxWorkGroupSize();
auto config = Configuration();
auto configurations = std::vector<Configuration>();
- PopulateConfigurations(parameters, 0, config, configurations,
- local_mem_max, constraints, local_mem_size_info);
+ PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config,
+ 0, config, configurations,
+ local_mem_max, constraints, local_mem_size_info,
+ max_work_item_sizes, max_work_group_size);
return configurations;
}
// Iterates recursively over all permutations of the user-defined parameters
void PopulateConfigurations(const std::vector<Parameter> &parameters,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const size_t index, const Configuration &config,
std::vector<Configuration> &configuration,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info) {
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size) {
// End of the chain: all parameters are considered, store the resulting configuration if it is a
// valid one according to the constraints
if (index == parameters.size()) {
- if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info)) {
+ if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info,
+ local_size_base, mul_local_config, div_local_config,
+ max_work_item_sizes, max_work_group_size)) {
configuration.push_back(config);
}
return;
@@ -55,8 +69,10 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
for (auto &value: parameter.second) {
auto config_copy = config;
config_copy[parameter.first] = value;
- PopulateConfigurations(parameters, index+1, config_copy, configuration,
- local_mem_max, constraints, local_mem_size_info);
+ PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config,
+ index+1, config_copy, configuration,
+ local_mem_max, constraints, local_mem_size_info,
+ max_work_item_sizes, max_work_group_size);
}
}
@@ -64,7 +80,12 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
bool ValidConfiguration(const Configuration &config,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info) {
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size) {
// Iterates over all constraints
for (auto &constraint: constraints) {
@@ -92,6 +113,20 @@ bool ValidConfiguration(const Configuration &config,
return false;
}
+ // Checks the local thread size (both per dimension and in total)
+ const auto local = SetThreadConfiguration(config, local_size_base,
+ mul_local_config, div_local_config);
+ for (auto i=size_t{0}; i<local.size(); ++i) {
+ if (local[i] > max_work_item_sizes[i]) {
+ return false;
+ }
+ }
+ auto local_size = size_t{1};
+ for (auto &item: local) { local_size *= item; }
+ if (local_size > max_work_group_size) {
+ return false;
+ }
+
// Everything was OK: this configuration is valid
return true;
}
diff --git a/src/tuning/configurations.hpp b/src/tuning/configurations.hpp
index faa5498f..4b9ba93f 100644
--- a/src/tuning/configurations.hpp
+++ b/src/tuning/configurations.hpp
@@ -50,6 +50,9 @@ struct LocalMemSizeInfo {
// function to find all configurations. It also applies the user-defined constraints within.
std::vector<Configuration> SetConfigurations(const Device& device,
const std::vector<Parameter> parameters,
+ const std::vector<size_t>& local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const Constraints& constraints,
const LocalMemSizeInfo& local_mem_size_info);
@@ -58,11 +61,16 @@ std::vector<Configuration> SetConfigurations(const Device& device,
// At the end of each chain (when all parameters are considered), the function stores the result
// into the configuration list.
void PopulateConfigurations(const std::vector<Parameter> &parameters,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
const size_t index, const Configuration &config,
std::vector<Configuration> &configuration,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info);
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size);
// Loops over all user-defined constraints to check whether or not the configuration is valid.
// Assumes initially all configurations are valid, then returns false if one of the constraints has
@@ -71,7 +79,12 @@ void PopulateConfigurations(const std::vector<Parameter> &parameters,
bool ValidConfiguration(const Configuration &config,
const size_t local_mem_max,
const Constraints& constraints,
- const LocalMemSizeInfo& local_mem_size_info);
+ const LocalMemSizeInfo& local_mem_size_info,
+ const std::vector<size_t> local_size_base,
+ const TransformVector& mul_local_config,
+ const TransformVector& div_local_config,
+ const std::vector<size_t>& max_work_item_sizes,
+ const size_t max_work_group_size);
// Processes multipliers and dividers to obtain the final thread configuration
std::vector<size_t> SetThreadConfiguration(const Configuration& config,
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index 822f8851..d382fb18 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -172,7 +172,8 @@ void Tuner(int argc, char* argv[], const int V,
}
// Sets the tunable parameters and their possible values
- auto configurations = SetConfigurations(device, settings.parameters,
+ auto configurations = SetConfigurations(device, settings.parameters, settings.local_size,
+ settings.mul_local, settings.div_local,
SetConstraints(V), ComputeLocalMemSize(V));
printf("* Found %s%zu configuration(s)%s\n",
kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str());
diff --git a/src/tuning/tuning_api.cpp b/src/tuning/tuning_api.cpp
index 2eec2e2e..2cc9b786 100644
--- a/src/tuning/tuning_api.cpp
+++ b/src/tuning/tuning_api.cpp
@@ -264,7 +264,8 @@ StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
}
// Sets the tunable parameters and their possible values
- auto configurations = SetConfigurations(device, settings.parameters,
+ auto configurations = SetConfigurations(device, settings.parameters, settings.local_size,
+ settings.mul_local, settings.div_local,
SetConstraints(V), ComputeLocalMemSize(V));
// Select the search method (full search or a random fraction)