From fa84ac36f23c1aebb5facf946b21d8c0f1a4a46d Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 28 Jul 2018 16:01:03 +0200 Subject: The tuners now also check for valid local thread configurations and skip invalid ones completely, saving compilation time --- src/tuning/configurations.cpp | 49 ++++++++++++++++++++++++++++++++++++------- src/tuning/configurations.hpp | 17 +++++++++++++-- src/tuning/tuning.cpp | 3 ++- src/tuning/tuning_api.cpp | 3 ++- 4 files changed, 61 insertions(+), 11 deletions(-) (limited to 'src/tuning') diff --git a/src/tuning/configurations.cpp b/src/tuning/configurations.cpp index 1fe232cf..82d7e3b4 100644 --- a/src/tuning/configurations.cpp +++ b/src/tuning/configurations.cpp @@ -23,28 +23,42 @@ namespace clblast { // Finds all configurations. It also applies the user-defined constraints within. std::vector SetConfigurations(const Device& device, const std::vector parameters, + const std::vector& local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const Constraints& constraints, const LocalMemSizeInfo& local_mem_size_info) { const auto local_mem_max = device.LocalMemSize(); + const auto max_work_item_sizes = device.MaxWorkItemSizes(); + const auto max_work_group_size = device.MaxWorkGroupSize(); auto config = Configuration(); auto configurations = std::vector(); - PopulateConfigurations(parameters, 0, config, configurations, - local_mem_max, constraints, local_mem_size_info); + PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config, + 0, config, configurations, + local_mem_max, constraints, local_mem_size_info, + max_work_item_sizes, max_work_group_size); return configurations; } // Iterates recursively over all permutations of the user-defined parameters void PopulateConfigurations(const std::vector &parameters, + const std::vector local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const size_t index, 
const Configuration &config, std::vector &configuration, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info) { + const LocalMemSizeInfo& local_mem_size_info, + const std::vector& max_work_item_sizes, + const size_t max_work_group_size) { // End of the chain: all parameters are considered, store the resulting configuration if it is a // valid one according to the constraints if (index == parameters.size()) { - if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info)) { + if (ValidConfiguration(config, local_mem_max, constraints, local_mem_size_info, + local_size_base, mul_local_config, div_local_config, + max_work_item_sizes, max_work_group_size)) { configuration.push_back(config); } return; @@ -55,8 +69,10 @@ void PopulateConfigurations(const std::vector &parameters, for (auto &value: parameter.second) { auto config_copy = config; config_copy[parameter.first] = value; - PopulateConfigurations(parameters, index+1, config_copy, configuration, - local_mem_max, constraints, local_mem_size_info); + PopulateConfigurations(parameters, local_size_base, mul_local_config, div_local_config, + index+1, config_copy, configuration, + local_mem_max, constraints, local_mem_size_info, + max_work_item_sizes, max_work_group_size); } } @@ -64,7 +80,12 @@ void PopulateConfigurations(const std::vector &parameters, bool ValidConfiguration(const Configuration &config, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info) { + const LocalMemSizeInfo& local_mem_size_info, + const std::vector local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, + const std::vector& max_work_item_sizes, + const size_t max_work_group_size) { // Iterates over all constraints for (auto &constraint: constraints) { @@ -92,6 +113,20 @@ bool ValidConfiguration(const Configuration &config, return false; } + // Checks the local thread size (both per 
dimension and in total) + const auto local = SetThreadConfiguration(config, local_size_base, + mul_local_config, div_local_config); + for (auto i=size_t{0}; i<local.size(); ++i) { + if (local[i] > max_work_item_sizes[i]) { + return false; + } + } + auto local_size = size_t{1}; + for (auto &item: local) { local_size *= item; } + if (local_size > max_work_group_size) { + return false; + } + // Everything was OK: this configuration is valid return true; } diff --git a/src/tuning/configurations.hpp b/src/tuning/configurations.hpp index faa5498f..4b9ba93f 100644 --- a/src/tuning/configurations.hpp +++ b/src/tuning/configurations.hpp @@ -50,6 +50,9 @@ struct LocalMemSizeInfo { // function to find all configurations. It also applies the user-defined constraints within. std::vector SetConfigurations(const Device& device, const std::vector parameters, + const std::vector& local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const Constraints& constraints, const LocalMemSizeInfo& local_mem_size_info); @@ -58,11 +61,16 @@ std::vector SetConfigurations(const Device& device, // At the end of each chain (when all parameters are considered), the function stores the result // into the configuration list. void PopulateConfigurations(const std::vector &parameters, + const std::vector local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, const size_t index, const Configuration &config, std::vector &configuration, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info); + const LocalMemSizeInfo& local_mem_size_info, + const std::vector& max_work_item_sizes, + const size_t max_work_group_size); // Loops over all user-defined constraints to check whether or not the configuration is valid. 
// Assumes initially all configurations are valid, then returns false if one of the constraints has @@ -71,7 +79,12 @@ void PopulateConfigurations(const std::vector &parameters, bool ValidConfiguration(const Configuration &config, const size_t local_mem_max, const Constraints& constraints, - const LocalMemSizeInfo& local_mem_size_info); + const LocalMemSizeInfo& local_mem_size_info, + const std::vector local_size_base, + const TransformVector& mul_local_config, + const TransformVector& div_local_config, + const std::vector& max_work_item_sizes, + const size_t max_work_group_size); // Processes multipliers and dividers to obtain the final thread configuration std::vector SetThreadConfiguration(const Configuration& config, diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp index 822f8851..d382fb18 100644 --- a/src/tuning/tuning.cpp +++ b/src/tuning/tuning.cpp @@ -172,7 +172,8 @@ void Tuner(int argc, char* argv[], const int V, } // Sets the tunable parameters and their possible values - auto configurations = SetConfigurations(device, settings.parameters, + auto configurations = SetConfigurations(device, settings.parameters, settings.local_size, + settings.mul_local, settings.div_local, SetConstraints(V), ComputeLocalMemSize(V)); printf("* Found %s%zu configuration(s)%s\n", kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); diff --git a/src/tuning/tuning_api.cpp b/src/tuning/tuning_api.cpp index 2eec2e2e..2cc9b786 100644 --- a/src/tuning/tuning_api.cpp +++ b/src/tuning/tuning_api.cpp @@ -264,7 +264,8 @@ StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, } // Sets the tunable parameters and their possible values - auto configurations = SetConfigurations(device, settings.parameters, + auto configurations = SetConfigurations(device, settings.parameters, settings.local_size, + settings.mul_local, settings.div_local, SetConstraints(V), ComputeLocalMemSize(V)); // Select the search method (full search or a random fraction) -- cgit v1.2.3 
From bc47e7e7cc675dcd57f8b492356b21964531ab98 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 28 Jul 2018 16:08:22 +0200 Subject: Added print statements to indicate the 4 stages of GEMM tuning --- src/tuning/kernels/xgemm.cpp | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src/tuning') diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 75e776e6..dd907ba4 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -33,9 +33,13 @@ void StartVariation(int argc, char *argv[]) { // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { + printf("* (1/4) Tuning main GEMM kernel (GEMMK == 0) for fixed set of parameters\n\n"); StartVariation<1>(argc, argv); + printf("* (2/4) Tuning main GEMM kernel (GEMMK == 0) for random parameters out of larger set\n\n"); StartVariation<2>(argc, argv); + printf("* (3/4) Tuning secondary GEMM kernel (GEMMK == 1) for fixed set of parameters\n\n"); StartVariation<11>(argc, argv); + printf("* (4/4) Tuning secondary GEMM kernel (GEMMK == 1) for random parameters out of larger set\n\n"); StartVariation<12>(argc, argv); return 0; } -- cgit v1.2.3 From 9bedaa752dc999224edf73eb5dd7e1c41662928f Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 15 Sep 2018 17:35:26 +0200 Subject: Fixed an MSVC compilation error due to large strings --- src/routines/level3/xgemm.cpp | 1 + src/routines/level3/xherk.cpp | 1 + src/routines/level3/xsyrk.cpp | 1 + src/routines/levelx/xgemmbatched.cpp | 1 + src/routines/levelx/xgemmstridedbatched.cpp | 1 + src/tuning/kernels/xgemm.hpp | 2 ++ 6 files changed, 7 insertions(+) (limited to 'src/tuning') diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index fd5a20db..cb24460a 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -40,6 +40,7 @@ Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): , // separated in multiple parts to prevent C1091 in MSVC 
2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" }) { diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index 6912d3a9..2e6f30ec 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -32,6 +32,7 @@ Xherk::Xherk(Queue &queue, EventPointer event, const std::string &name): , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" }) { diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index 6bb2a24f..5ffdc028 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -32,6 +32,7 @@ Xsyrk::Xsyrk(Queue &queue, EventPointer event, const std::string &name): , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" }) { diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp index 2bbc5007..b12b8734 100644 --- a/src/routines/levelx/xgemmbatched.cpp +++ b/src/routines/levelx/xgemmbatched.cpp @@ -38,6 +38,7 @@ XgemmBatched::XgemmBatched(Queue &queue, EventPointer event, const std::strin , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + , // separated in multiple parts to 
prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 diff --git a/src/routines/levelx/xgemmstridedbatched.cpp b/src/routines/levelx/xgemmstridedbatched.cpp index 30c161cc..d9e3ebba 100644 --- a/src/routines/levelx/xgemmstridedbatched.cpp +++ b/src/routines/levelx/xgemmstridedbatched.cpp @@ -37,6 +37,7 @@ XgemmStridedBatched::XgemmStridedBatched(Queue &queue, EventPointer event, co , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_part4.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 diff --git a/src/tuning/kernels/xgemm.hpp b/src/tuning/kernels/xgemm.hpp index 9a538c1b..fa1bb6ec 100644 --- a/src/tuning/kernels/xgemm.hpp +++ b/src/tuning/kernels/xgemm.hpp @@ -50,6 +50,8 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments &args) { settings.sources += #include "../src/kernels/level3/xgemm_part1.opencl" #include "../src/kernels/level3/xgemm_part2.opencl" + ; + settings.sources += #include "../src/kernels/level3/xgemm_part3.opencl" #include "../src/kernels/level3/xgemm_part4.opencl" ; -- cgit v1.2.3