diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-11-19 20:05:15 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-11-19 20:05:15 +0100 |
commit | da76d7ab81555452a1049eb1a6d130073427067d (patch) | |
tree | 92439d8bee44c34d63f288a73bdc372ba84dc42b /src | |
parent | c41d219ea42087c1b8d933b733b381005123cb91 (diff) | |
parent | defad3d1a249dd5f8c011cf28cc3c888d710d56a (diff) |
Merge pull request #216 from CNugteren/integrated_tuner
Integrated tuner
Diffstat (limited to 'src')
27 files changed, 1079 insertions, 549 deletions
diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 82fc44fd..0db64ad9 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -352,6 +352,13 @@ class Device { std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV)); } + // Retrieves the above extra information (if present) + std::string GetExtraInfo() const { + if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); } + if (HasExtension("cl_nv_device_attribute_query")) { return NVIDIAComputeCapability(); } + else { return std::string{""}; } + } + // Accessor to the private data-member const RawDeviceID& operator()() const { return device_; } private: diff --git a/src/cupp11.hpp b/src/cupp11.hpp index ec21c5b1..00337ebd 100644 --- a/src/cupp11.hpp +++ b/src/cupp11.hpp @@ -326,6 +326,9 @@ public: std::string AMDBoardName() const { return ""; } std::string NVIDIAComputeCapability() const { return Capabilities(); } + // Retrieves the above extra information + std::string GetExtraInfo() const { return NVIDIAComputeCapability(); } + // Accessor to the private data-member const RawDeviceID& operator()() const { return device_; } private: diff --git a/src/routine.cpp b/src/routine.cpp index 81201eea..93882fbf 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -135,74 +135,21 @@ void Routine::InitProgram(std::initializer_list<const char *> source) { throw RuntimeErrorCode(StatusCode::kNoHalfPrecision); } - // Collects the parameters for this device in the form of defines, and adds the precision + // Collects the parameters for this device in the form of defines auto source_string = std::string{""}; for (const auto &kernel_name : kernel_names_) { source_string += db_(kernel_name).GetDefines(); } - source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n"; - - // Adds the name of the routine as a define - source_string += "#define ROUTINE_"+routine_name_+"\n"; - - // Not all OpenCL compilers support the 'inline' keyword. 
The keyword is only used for devices on - // which it is known to work with all OpenCL platforms. - if (device_.IsNVIDIA() || device_.IsARM()) { - source_string += "#define USE_INLINE_KEYWORD 1\n"; - } - - // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve - // performance, but might result in a reduced accuracy. - if (device_.IsAMD() && device_.IsGPU()) { - source_string += "#define USE_CL_MAD 1\n"; - } - - // For specific devices, use staggered/shuffled workgroup indices. - if (device_.IsAMD() && device_.IsGPU()) { - source_string += "#define USE_STAGGERED_INDICES 1\n"; - } - - // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize - // performance through better cache behaviour - if (device_.IsARM() && device_.IsGPU()) { - source_string += "#define GLOBAL_MEM_FENCE 1\n"; - } - - // Optionally adds a translation header from OpenCL kernels to CUDA kernels - #ifdef CUDA_API - source_string += - #include "kernels/opencl_to_cuda.h" - ; - #endif - - // Loads the common header (typedefs and defines and such) - source_string += - #include "kernels/common.opencl" - ; // Adds routine-specific code to the constructed source string for (const char *s: source) { source_string += s; } - // Prints details of the routine to compile in case of debugging in verbose mode - #ifdef VERBOSE - printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n", - routine_name_.c_str(), ToString(precision_).c_str(), device_name.c_str()); - const auto start_time = std::chrono::steady_clock::now(); - #endif + // Completes the source and compiles the kernel + program_ = CompileFromSource(source_string, precision_, routine_name_, + device_, context_, options); - // Compiles the kernel - program_ = Program(context_, source_string); - try { - program_.Build(device_, options); - } catch (const CLCudaAPIBuildError &e) { - if (program_.StatusIsCompilationWarningOrError(e.status())) { - fprintf(stdout, "OpenCL compiler 
error/warning: %s\n", - program_.GetBuildInfo(device_).c_str()); - } - throw; - } // Store the compiled binary and program in the cache BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name}, @@ -210,13 +157,6 @@ void Routine::InitProgram(std::initializer_list<const char *> source) { ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info}, Program{ program_ }); - - // Prints the elapsed compilation time in case of debugging in verbose mode - #ifdef VERBOSE - const auto elapsed_time = std::chrono::steady_clock::now() - start_time; - const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count(); - printf("[DEBUG] Completed compilation in %.2lf ms\n", timing); - #endif } // ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp index bf3b1762..06d001d9 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -20,6 +20,7 @@ #include <vector> #include "utilities/utilities.hpp" +#include "utilities/compile.hpp" #include "database/database.hpp" namespace clblast { diff --git a/src/tuning/configurations.cpp b/src/tuning/configurations.cpp new file mode 100644 index 00000000..459d66b1 --- /dev/null +++ b/src/tuning/configurations.cpp @@ -0,0 +1,99 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune). +// This is only used for the optional tuner binaries and not part of the core of CLBlast. 
+// +// ================================================================================================= + +#include <vector> +#include <string> + +#include "tuning/configurations.hpp" + +namespace clblast { +// ================================================================================================= + +// Finds all configurations. It also applies the user-defined constraints within. +std::vector<Configuration> SetConfigurations(const std::vector<Parameter> parameters, + const Constraints& constraints) { + auto config = Configuration(); + auto configurations = std::vector<Configuration>(); + PopulateConfigurations(parameters, 0, config, configurations, constraints); + return configurations; +} + +// Iterates recursively over all permutations of the user-defined parameters +void PopulateConfigurations(const std::vector<Parameter> &parameters, + const size_t index, const Configuration &config, + std::vector<Configuration> &configuration, + const Constraints& constraints) { + + // End of the chain: all parameters are considered, store the resulting configuration if it is a + // valid one according to the constraints + if (index == parameters.size()) { + if (ValidConfiguration(config, constraints)) { + configuration.push_back(config); + } + return; + } + + // This loop iterates over all values of the current parameter and calls this function recursively + Parameter parameter = parameters[index]; + for (auto &value: parameter.second) { + auto config_copy = config; + config_copy[parameter.first] = value; + PopulateConfigurations(parameters, index+1, config_copy, configuration, constraints); + } +} + +// Loops over all user-defined constraints to check whether or not the configuration is valid +bool ValidConfiguration(const Configuration &config, + const Constraints& constraints) { + + // Iterates over all constraints + for (auto &constraint: constraints) { + + // Finds the values of the parameters + auto values = std::vector<size_t>(constraint.parameters.size()); + 
for (auto i=size_t{0}; i<constraint.parameters.size(); ++i) { + values[i] = config.at(constraint.parameters[i]); + } + + // Checks this constraint for these values + if (!constraint.valid_if(values)) { + return false; + } + } + + // Everything was OK: this configuration is valid + return true; +} + +// Multiplies and/or dividers a thread configuration (local/global) +std::vector<size_t> SetThreadConfiguration(const Configuration& config, + const std::vector<size_t> base, + const TransformVector& mul_config, + const TransformVector& div_config) { + auto result = base; + for (const auto &multipliers: mul_config) { + for (auto i = size_t{0}; i < multipliers.size(); ++i) { + result[i] *= config.at(multipliers[i]); + } + } + for (const auto &dividers: div_config) { + for (auto i = size_t{0}; i < dividers.size(); ++i) { + result[i] /= config.at(dividers[i]); + } + } + return result; +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/tuning/configurations.hpp b/src/tuning/configurations.hpp new file mode 100644 index 00000000..74679ff6 --- /dev/null +++ b/src/tuning/configurations.hpp @@ -0,0 +1,73 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune). +// This is only used for the optional tuner binaries and not part of the core of CLBlast. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TUNING_CONFIGURATIONS_H_ +#define CLBLAST_TUNING_CONFIGURATIONS_H_ + +#include <vector> +#include <string> +#include <map> + +#include "utilities/utilities.hpp" + +namespace clblast { +// ================================================================================================= + +using Configuration = std::map<std::string, size_t>; +using Parameter = std::pair<std::string, std::vector<size_t>>; +using TransformVector = std::vector<std::vector<std::string>>; + +// Helper structure holding a constraint on parameters. This constraint consists of a constraint +// function object and a vector of parameter names represented as strings. +using ConstraintFunction = std::function<bool(std::vector<size_t>)>; +struct Constraint { + ConstraintFunction valid_if; + std::vector<std::string> parameters; +}; +using Constraints = std::vector<Constraint>; + +// ================================================================================================= + +// Initializes an empty configuration (vector of name/value pairs) and kicks-off the recursive +// function to find all configurations. It also applies the user-defined constraints within. +std::vector<Configuration> SetConfigurations(const std::vector<Parameter> parameters, + const Constraints& constraints); + +// Iterates recursively over all permutations of the user-defined parameters. This code creates +// multiple chains, in which each chain selects a unique combination of values for all parameters. +// At the end of each chain (when all parameters are considered), the function stores the result +// into the configuration list. 
+void PopulateConfigurations(const std::vector<Parameter> &parameters, + const size_t index, const Configuration &config, + std::vector<Configuration> &configuration, + const Constraints& constraints); + +// Loops over all user-defined constraints to check whether or not the configuration is valid. +// Assumes initially all configurations are valid, then returns false if one of the constraints has +// not been met. Constraints consist of a user-defined function and a list of parameter names, which +// are replaced by parameter values in this function. +bool ValidConfiguration(const Configuration &config, + const Constraints& constraints); + +// Processes multipliers and dividers to obtain the final thread configuration +std::vector<size_t> SetThreadConfiguration(const Configuration& config, + const std::vector<size_t> base, + const TransformVector& mul_config, + const TransformVector& div_config); + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TUNING_CONFIGURATIONS_H_ +#endif diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index 068c5f1b..462107d3 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels. +// This file uses the auto-tuner to tune the copy OpenCL kernels. 
// // ================================================================================================= @@ -42,7 +42,6 @@ class TuneCopy { settings.kernel_family = "copy"; settings.kernel_name = "CopyMatrixFast"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/copy_fast.opencl" ; @@ -51,6 +50,10 @@ class TuneCopy { settings.size_a = args.m * args.n; settings.size_b = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -78,20 +81,15 @@ class TuneCopy { // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } + static std::vector<Constraint> SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, - std::vector<T> &, std::vector<T> &, - std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &, - std::vector<T> &) { - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); + static void SetArguments(Kernel &kernel, const Arguments<T> &args, + std::vector<Buffer<T>>& buffers) { + kernel.SetArgument(0, static_cast<int>(args.m)); + kernel.SetArgument(1, buffers[2]()); // 2 == A matrix + kernel.SetArgument(2, buffers[3]()); // 3 == B matrix + kernel.SetArgument(3, GetRealArg(args.alpha)); } }; diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 7102d05d..24557517 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ 
b/src/tuning/kernels/copy_pad.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels. +// This file uses the auto-tuner to tune the pad OpenCL kernels. // // ================================================================================================= @@ -42,7 +42,6 @@ class TunePad { settings.kernel_family = "pad"; settings.kernel_name = "CopyPadMatrix"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/copy_pad.opencl" ; @@ -51,6 +50,10 @@ class TunePad { settings.size_a = args.m * args.n; settings.size_b = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -78,28 +81,23 @@ class TunePad { // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } + static std::vector<Constraint> SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, - std::vector<T> &, std::vector<T> &, - std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &, - std::vector<T> &) { - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(static_cast<int>(args.m)); - 
tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentScalar(0); + static void SetArguments(Kernel &kernel, const Arguments<T> &args, + std::vector<Buffer<T>>& buffers) { + kernel.SetArgument(0, static_cast<int>(args.m)); + kernel.SetArgument(1, static_cast<int>(args.n)); + kernel.SetArgument(2, static_cast<int>(args.m)); + kernel.SetArgument(3, 0); + kernel.SetArgument(4, buffers[2]()); // 2 == A matrix + kernel.SetArgument(5, static_cast<int>(args.m)); + kernel.SetArgument(6, static_cast<int>(args.n)); + kernel.SetArgument(7, static_cast<int>(args.m)); + kernel.SetArgument(8, 0); + kernel.SetArgument(9, buffers[3]()); // 3 == B matrix + kernel.SetArgument(10, GetRealArg(args.alpha)); + kernel.SetArgument(11, 0); } }; diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 56726903..1e0d3c7b 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels. +// This file uses the auto-tuner to tune the transpose OpenCL kernels. 
// // ================================================================================================= @@ -42,7 +42,6 @@ class TuneTranspose { settings.kernel_family = "transpose"; settings.kernel_name = "TransposeMatrixFast"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/transpose_fast.opencl" ; @@ -51,6 +50,10 @@ class TuneTranspose { settings.size_a = args.m * args.n; settings.size_b = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -78,25 +81,15 @@ class TuneTranspose { // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { - auto LocalMemorySize = [args] (std::vector<size_t> v) { - return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); - } + static std::vector<Constraint> SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, - std::vector<T> &, std::vector<T> &, - std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &, - std::vector<T> &) { - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); + static void SetArguments(Kernel &kernel, const Arguments<T> &args, + std::vector<Buffer<T>>& buffers) { + kernel.SetArgument(0, static_cast<int>(args.m)); + kernel.SetArgument(1, buffers[2]()); // 2 == A matrix + 
kernel.SetArgument(2, buffers[3]()); // 3 == B matrix + kernel.SetArgument(3, GetRealArg(args.alpha)); } }; diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index dc46e903..087f8e67 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels. +// This file uses the auto-tuner to tune the pad-transpose OpenCL kernels. // // ================================================================================================= @@ -42,7 +42,6 @@ class TunePadTranspose { settings.kernel_family = "padtranspose"; settings.kernel_name = "TransposePadMatrix"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/transpose_pad.opencl" ; @@ -51,6 +50,10 @@ class TunePadTranspose { settings.size_a = args.m * args.n; settings.size_b = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -77,33 +80,23 @@ class TunePadTranspose { // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { - auto LocalMemorySize = [args] (std::vector<size_t> v) { - return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); - } + static std::vector<Constraint> SetConstraints() { return {}; } // Sets the kernel's arguments - static void 
SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, - std::vector<T> &, std::vector<T> &, - std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &, - std::vector<T> &) { - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentScalar(0); + static void SetArguments(Kernel &kernel, const Arguments<T> &args, + std::vector<Buffer<T>>& buffers) { + kernel.SetArgument(0, static_cast<int>(args.m)); + kernel.SetArgument(1, static_cast<int>(args.n)); + kernel.SetArgument(2, static_cast<int>(args.m)); + kernel.SetArgument(3, 0); + kernel.SetArgument(4, buffers[2]()); // 2 == A matrix + kernel.SetArgument(5, static_cast<int>(args.n)); + kernel.SetArgument(6, static_cast<int>(args.m)); + kernel.SetArgument(7, static_cast<int>(args.n)); + kernel.SetArgument(8, 0); + kernel.SetArgument(9, buffers[3]()); // 3 == B matrix + kernel.SetArgument(10, GetRealArg(args.alpha)); + kernel.SetArgument(11, 0); } }; diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index e201949a..d843ea78 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the xaxpy OpenCL kernels. +// This file uses the auto-tuner to tune the xaxpy OpenCL kernels. 
// // ================================================================================================= @@ -41,7 +41,6 @@ class TuneXaxpy { settings.kernel_family = "xaxpy"; settings.kernel_name = "XaxpyFastest"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level1/level1.opencl" #include "../src/kernels/level1/xaxpy.opencl" ; @@ -50,6 +49,10 @@ class TuneXaxpy { settings.size_x = args.n; settings.size_y = args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1}; + settings.outputs = {1}; + // Sets the base thread configuration settings.global_size = {args.n}; settings.global_size_ref = settings.global_size; @@ -80,20 +83,15 @@ class TuneXaxpy { throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW"); } } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } + static std::vector<Constraint> SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, - std::vector<T> &x_vec, std::vector<T> &y_vec, - std::vector<T> &, std::vector<T> &, std::vector<T> &, - std::vector<T> &) { - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentOutput(y_vec); + static void SetArguments(Kernel &kernel, const Arguments<T> &args, + std::vector<Buffer<T>>& buffers) { + kernel.SetArgument(0, static_cast<int>(args.n)); + kernel.SetArgument(1, GetRealArg(args.alpha)); + kernel.SetArgument(2, buffers[0]()); // 0 == X vector + kernel.SetArgument(3, buffers[1]()); // 1 == Y vector } }; diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp index fb532680..12350657 100644 --- a/src/tuning/kernels/xdot.cpp +++ b/src/tuning/kernels/xdot.cpp @@ -7,7 
+7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the xdot OpenCL kernels. Note that the results are +// This file uses the auto-tuner to tune the xdot OpenCL kernels. Note that the results are // not verified, since the result is not final and depends on the WGS2 parameter. // // ================================================================================================= @@ -42,7 +42,6 @@ class TuneXdot { settings.kernel_family = "xdot_"+std::to_string(V); settings.kernel_name = (V==1) ? "Xdot" : "XdotEpilogue"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level1/xdot.opencl" ; @@ -51,6 +50,10 @@ class TuneXdot { settings.size_y = args.n; settings.size_temp = args.n; // Worst case + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1, 5}; + settings.outputs = {}; // no output checking + // Sets the base thread configuration settings.global_size = (V==1) ? std::vector<size_t>{2*64} : std::vector<size_t>{1}; settings.global_size_ref = (V==1) ? std::vector<size_t>{2*64*64} : std::vector<size_t>{64}; @@ -58,8 +61,8 @@ class TuneXdot { settings.local_size_ref = {64}; // Transforms the thread configuration based on the parameters - settings.mul_local = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}}; - settings.mul_global = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}}; + settings.mul_local = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; + settings.mul_global = (V==1) ? 
TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; // Sets the tuning parameters and their possible values settings.parameters = { @@ -75,31 +78,26 @@ class TuneXdot { // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } + static std::vector<Constraint> SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, - std::vector<T> &x_vec, std::vector<T> &y_vec, - std::vector<T> &, std::vector<T> &, std::vector<T> &, - std::vector<T> &temp) { + static void SetArguments(Kernel &kernel, const Arguments<T> &args, + std::vector<Buffer<T>>& buffers) { if (V == 1) { - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(y_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(temp); // No output checking for the result - size varies - tuner.AddArgumentScalar(static_cast<int>(false)); + kernel.SetArgument(0, static_cast<int>(args.n)); + kernel.SetArgument(1, buffers[0]()); // 0 == X vector + kernel.SetArgument(2, 0); + kernel.SetArgument(3, 1); + kernel.SetArgument(4, buffers[1]()); // 1 == Y vector + kernel.SetArgument(5, 0); + kernel.SetArgument(6, 1); + kernel.SetArgument(7, buffers[5]()); // 5 == temp; no output checking - size varies + kernel.SetArgument(8, static_cast<int>(false)); } else { - tuner.AddArgumentInput(temp); - tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere - tuner.AddArgumentScalar(0); + kernel.SetArgument(0, buffers[5]()); // 5 == temp + kernel.SetArgument(1, buffers[0]()); // 0 == X vector; no output checking - size varies + kernel.SetArgument(2, 0); } 
} }; diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 6dcdf68b..16e32988 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations: +// This file uses the auto-tuner to tune the xgemm OpenCL kernels. There are two variations: // - V==1: This tests some limited set of tuning parameters exhaustively. // - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset. // @@ -38,7 +38,6 @@ class TuneXgemm { settings.default_k = 1024; settings.default_fraction = (V==1) ? 1.0 : 512.0; // test all or sample randomly settings.default_num_runs = 2; - settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch); return settings; } @@ -50,7 +49,6 @@ class TuneXgemm { settings.kernel_family = (V==1) ? "xgemm_1" : "xgemm_2"; settings.kernel_name = "Xgemm"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/xgemm_part1.opencl" #include "../src/kernels/level3/xgemm_part2.opencl" #include "../src/kernels/level3/xgemm_part3.opencl" @@ -61,6 +59,10 @@ class TuneXgemm { settings.size_b = args.n * args.k; settings.size_c = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3, 4}; + settings.outputs = {4}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -114,74 +116,51 @@ class TuneXgemm { settings.metric_amount = 2 * args.m * args.n * args.k; settings.performance_unit = "GFLOPS"; - // Returns which search heuristic to use - if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); } - else { - // Use full-search to explore all parameter combinations or another strategy to search only a - // part of the parameter 
values. The fraction is set as a command-line argument. - if (args.fraction == 1.0 || args.fraction == 0.0) { - settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); - } else { - settings.heuristic = args.heuristic_selection; - } - } - return settings; } // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - - // Sets the constraints - static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + static std::vector<Constraint> SetConstraints() { + auto constraints = std::vector<Constraint>(); auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); }; auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; // Requirement for unrolling the KWG loop - tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"}); + constraints.push_back({MultipleOfX, {"KWG", "KWI"}}); // Required for integer MWI and NWI - tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}); + constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}}); + constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}}); // Required for integer MWIA and NWIB - tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}); + constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}}); + constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}}); // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) 
- tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); - tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); + constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}}); + constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}}); // Extra constraints for variation 1 to limit the set of options significantly if (V==1) { auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; }; - tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"}); - tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"}); - tuner.AddConstraint(id, IsEqual, {"SA", "SB"}); + constraints.push_back({IsEqual, {"MDIMC", "MDIMA"}}); + constraints.push_back({IsEqual, {"NDIMC", "NDIMB"}}); + constraints.push_back({IsEqual, {"SA", "SB"}}); } - } - - // Sets the local memory size - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { - auto LocalMemorySize = [args] (std::vector<size_t> v) { - return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", - "SB", "KWG", "NWG"}); + return constraints; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, - std::vector<T> &, std::vector<T> &, - std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat, - std::vector<T> &) { - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(static_cast<int>(args.k)); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentScalar(GetRealArg(args.beta)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentInput(b_mat); - tuner.AddArgumentOutput(c_mat); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(0); + static void SetArguments(Kernel &kernel, const Arguments<T> &args, + std::vector<Buffer<T>>& buffers) { + 
kernel.SetArgument(0, static_cast<int>(args.m)); + kernel.SetArgument(1, static_cast<int>(args.n)); + kernel.SetArgument(2, static_cast<int>(args.k)); + kernel.SetArgument(3, GetRealArg(args.alpha)); + kernel.SetArgument(4, GetRealArg(args.beta)); + kernel.SetArgument(5, buffers[2]()); // 2 == A matrix + kernel.SetArgument(6, buffers[3]()); // 3 == B matrix + kernel.SetArgument(7, buffers[4]()); // 4 == C matrix + kernel.SetArgument(8, 0); + kernel.SetArgument(9, 0); } }; diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index 619fb37a..60a983b4 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations: +// This file uses the auto-tuner to tune the direct xgemm kernels. There are two variations: // - V==1: This tests some limited set of tuning parameters exhaustively. // - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset. // @@ -36,9 +36,8 @@ class TuneXgemmDirect { settings.default_m = 256; settings.default_n = 256; settings.default_k = 256; - settings.default_fraction = (V==1) ? 1.0 : 32.0; // test all or sample randomly + settings.default_fraction = (V==1) ? 1.0 : 64.0; // test all or sample randomly settings.default_num_runs = 4; - settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch); return settings; } @@ -50,7 +49,6 @@ class TuneXgemmDirect { settings.kernel_family = (V==1) ? 
"xgemm_direct_1" : "xgemm_direct_2"; settings.kernel_name = "XgemmDirectTN"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/xgemm_direct_part1.opencl" #include "../src/kernels/level3/xgemm_direct_part2.opencl" #include "../src/kernels/level3/xgemm_direct_part3.opencl" @@ -61,6 +59,10 @@ class TuneXgemmDirect { settings.size_b = args.n * args.k; settings.size_c = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3, 4}; + settings.outputs = {4}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -89,7 +91,7 @@ class TuneXgemmDirect { } else { // a lot more tuning parameters - has to be sampled randomly, too much to test all settings.parameters = { - {"WGD", {8, 16, 32, 64, 128}}, + {"WGD", {8, 16, 32, 64}}, {"MDIMCD", {8, 16, 32}}, {"NDIMCD", {8, 16, 32}}, {"MDIMAD", {8, 16, 32}}, @@ -106,79 +108,57 @@ class TuneXgemmDirect { settings.metric_amount = 2 * args.m * args.n * args.k; settings.performance_unit = "GFLOPS"; - // Returns which search heuristic to use - if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); } - else { - // Use full-search to explore all parameter combinations or another strategy to search only a - // part of the parameter values. The fraction is set as a command-line argument. 
- if (args.fraction == 1.0 || args.fraction == 0.0) { - settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); - } else { - settings.heuristic = args.heuristic_selection; - } - } - return settings; } // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - - // Sets the constraints - static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + static std::vector<Constraint> SetConstraints() { + auto constraints = std::vector<Constraint>(); auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); }; auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; // Requirement for unrolling the WGD loop - tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"}); + constraints.push_back({MultipleOfX, {"WGD", "KWID"}}); // Required for integer MWID and NWID - tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}); + constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}}); + constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}}); // Required for integer MWIAD and NWIBD - tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}); + constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}}); + constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}}); // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...) 
- tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}); - tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}); + constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}}); + constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}}); // Extra constraints for variation 1 to limit the set of options significantly if (V==1) { auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; }; - tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"}); - tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"}); + constraints.push_back({IsEqual, {"MDIMCD", "MDIMAD"}}); + constraints.push_back({IsEqual, {"NDIMCD", "NDIMBD"}}); } - } - - // Sets the local memory size - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { - auto LocalMemorySize = [args] (std::vector<size_t> v) { - return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"}); + return constraints; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, - std::vector<T> &, std::vector<T> &, - std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat, - std::vector<T> &) { - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(static_cast<int>(args.k)); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentScalar(GetRealArg(args.beta)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(0); // a_offset - tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld - tuner.AddArgumentInput(b_mat); - tuner.AddArgumentScalar(0); // b_offset - tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld - tuner.AddArgumentOutput(c_mat); - tuner.AddArgumentScalar(0); // c_offset - 
tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld - tuner.AddArgumentScalar(1); // c_do_transpose - tuner.AddArgumentScalar(0); // a_conjugate - tuner.AddArgumentScalar(0); // b_conjugate + static void SetArguments(Kernel &kernel, const Arguments<T> &args, + std::vector<Buffer<T>>& buffers) { + kernel.SetArgument(0, static_cast<int>(args.m)); + kernel.SetArgument(1, static_cast<int>(args.n)); + kernel.SetArgument(2, static_cast<int>(args.k)); + kernel.SetArgument(3, GetRealArg(args.alpha)); + kernel.SetArgument(4, GetRealArg(args.beta)); + kernel.SetArgument(5, buffers[2]()); // 2 == A matrix + kernel.SetArgument(6, 0); // a_offset + kernel.SetArgument(7, static_cast<int>(args.k)); // a_ld + kernel.SetArgument(8, buffers[3]()); // 3 == B matrix + kernel.SetArgument(9, 0); // b_offset + kernel.SetArgument(10, static_cast<int>(args.n)); // b_ld + kernel.SetArgument(11, buffers[4]()); // 4 == C matrix + kernel.SetArgument(12, 0); // c_offset + kernel.SetArgument(13, static_cast<int>(args.n)); // c_ld + kernel.SetArgument(14, 1); // c_do_transpose + kernel.SetArgument(15, 0); // a_conjugate + kernel.SetArgument(16, 0); // b_conjugate } }; diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index e66b15f1..3eadd32b 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned: +// This file uses the auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned: // 1: The full version of the kernel // 2: The fast version for non-transposed matrices // 3: The fast version for transposed matrices @@ -45,7 +45,6 @@ class TuneXgemv { settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? 
"XgemvFast" : "XgemvFastRot"); settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level2/xgemv.opencl" #include "../src/kernels/level2/xgemv_fast.opencl" ; @@ -55,6 +54,10 @@ class TuneXgemv { settings.size_y = args.m; settings.size_a = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1, 2}; + settings.outputs = {1}; + // Sets the base thread configuration settings.global_size = {args.m}; settings.global_size_ref = settings.global_size; @@ -63,9 +66,7 @@ class TuneXgemv { // Transforms the thread configuration based on the parameters settings.mul_local = {{"WGS"+std::to_string(V)}}; - settings.div_global = (V==1 || V==2) ? - TunerSettings::TransformVector{{"WPT"+std::to_string(V)}} : - TunerSettings::TransformVector{}; + settings.div_global = (V==1 || V==2) ? TransformVector{{"WPT"+std::to_string(V)}} : TransformVector{}; // Sets the tuning parameters and their possible values if (V==1) { @@ -98,53 +99,41 @@ class TuneXgemv { // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + static std::vector<Constraint> SetConstraints() { + auto constraints = std::vector<Constraint>(); if (V==2 || V==3) { auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; - tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); + constraints.push_back({MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}}); } if (V==3) { auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; }; - tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); - } - } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { - if (V==1 || V==2) { - auto LocalMemorySize = [args] (std::vector<size_t> v) { return 
v[0]*GetBytes(args.precision); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); - } - else { - auto LocalMemorySize = [args] (std::vector<size_t> v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); + constraints.push_back({LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}}); } + return constraints; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, - std::vector<T> &x_vec, std::vector<T> &y_vec, - std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &, - std::vector<T> &) { + static void SetArguments(Kernel &kernel, const Arguments<T> &args, + std::vector<Buffer<T>>& buffers) { auto a_rotated = (V==3) ? 1 : 0; - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentScalar(GetRealArg(args.beta)); - tuner.AddArgumentScalar(static_cast<int>(a_rotated)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentOutput(y_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentScalar(0); // Conjugate transpose - tuner.AddArgumentScalar(0); // Additional parameter - tuner.AddArgumentScalar(0); // Banded 'kl' - tuner.AddArgumentScalar(0); // Banded 'ku' + kernel.SetArgument(0, static_cast<int>(args.m)); + kernel.SetArgument(1, static_cast<int>(args.n)); + kernel.SetArgument(2, GetRealArg(args.alpha)); + kernel.SetArgument(3, GetRealArg(args.beta)); + kernel.SetArgument(4, a_rotated); + kernel.SetArgument(5, buffers[2]()); // 2 == A matrix + kernel.SetArgument(6, 0); + kernel.SetArgument(7, static_cast<int>(args.m)); + kernel.SetArgument(8, 
buffers[0]()); // 0 == X vector + kernel.SetArgument(9, 0); + kernel.SetArgument(10, 1); + kernel.SetArgument(11, buffers[1]()); // 1 == Y vector + kernel.SetArgument(12, 0); + kernel.SetArgument(13, 1); + kernel.SetArgument(14, 0); // Conjugate transpose + kernel.SetArgument(15, 0); // Additional parameter + kernel.SetArgument(16, 0); // Banded 'kl' + kernel.SetArgument(17, 0); // Banded 'ku' } }; diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index c2eb1d31..745e553f 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels. +// This file uses the auto-tuner to tune the xger OpenCL kernels. // // ================================================================================================= @@ -42,7 +42,6 @@ class TuneXger { settings.kernel_family = "xger"; settings.kernel_name = "Xger"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level2/level2.opencl" #include "../src/kernels/level2/xger.opencl" ; @@ -52,6 +51,10 @@ class TuneXger { settings.size_y = args.n; settings.size_a = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1, 2}; + settings.outputs = {2}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -78,29 +81,24 @@ class TuneXger { // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } + static std::vector<Constraint> SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, - 
std::vector<T> &x_vec, std::vector<T> &y_vec, - std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &, - std::vector<T> &) { - tuner.AddArgumentScalar(static_cast<int>(args.m)); - tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); // x_offset - tuner.AddArgumentScalar(1); // x_increment - tuner.AddArgumentInput(y_vec); - tuner.AddArgumentScalar(0); // y_offset - tuner.AddArgumentScalar(1); // y_increment - tuner.AddArgumentOutput(a_mat); - tuner.AddArgumentScalar(0); // a_offset - tuner.AddArgumentScalar(static_cast<int>(args.m)); // a_ld - tuner.AddArgumentScalar(0); // a_is_rowmajor + static void SetArguments(Kernel &kernel, const Arguments<T> &args, + std::vector<Buffer<T>>& buffers) { + kernel.SetArgument(0, static_cast<int>(args.m)); + kernel.SetArgument(1, static_cast<int>(args.n)); + kernel.SetArgument(2, GetRealArg(args.alpha)); + kernel.SetArgument(3, buffers[0]()); // 0 == X vector + kernel.SetArgument(4, 0); // x_offset + kernel.SetArgument(5, 1); // x_increment + kernel.SetArgument(6, buffers[1]()); // 1 == Y vector + kernel.SetArgument(7, 0); // y_offset + kernel.SetArgument(8, 1); // y_increment + kernel.SetArgument(9, buffers[2]()); // 2 == A matrix + kernel.SetArgument(10, 0); // a_offset + kernel.SetArgument(11, static_cast<int>(args.m)); // a_ld + kernel.SetArgument(12, 0); // a_is_rowmajor } }; diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp index a880c97e..cd22137a 100644 --- a/src/tuning/routines/xgemm.cpp +++ b/src/tuning/routines/xgemm.cpp @@ -18,7 +18,7 @@ #include <assert.h> #include "utilities/utilities.hpp" -#include "utilities/timing.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= @@ -68,7 +68,7 @@ void TuneXgemm(int argc, char* argv[]) { const auto platform = Platform(platform_id); const 
auto device = Device(platform, device_id); if (!PrecisionSupported<T>(device)) { - printf("* Unsupported precision, skipping this tuning run\n\n"); + printf("* Unsupported precision, skipping this tuning run\n"); return; } const auto context = Context(device); @@ -81,18 +81,18 @@ void TuneXgemm(int argc, char* argv[]) { auto buffers = std::vector<Buffer<T>>{a_mat, b_mat, c_mat}; // In-direct version - printf("[----------] Testing the in-direct GEMM routine for m=n=k\n"); + printf("\n* Testing the in-direct GEMM routine for m=n=k\n"); ForceSelectIndirectFrom<T>(0, device); const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>); // Direct version - printf("[----------] Testing the direct GEMM routine for m=n=k\n"); + printf("\n* Testing the direct GEMM routine for m=n=k\n"); ForceSelectIndirectFrom<T>(to * to * to + 1, device); const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>); // Determining final score and best kernel selection point assert(indirect.size() == direct.size()); - printf("[----------] Collecting results\n"); + printf("\n* Collecting results\n"); auto ratios = std::vector<double>(indirect.size()); for (auto i = size_t{0}; i < indirect.size(); ++i) { ratios[i] = indirect[i].second / direct[i].second; @@ -104,42 +104,55 @@ void TuneXgemm(int argc, char* argv[]) { for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); } const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1); + auto tuning_results = Configuration(); + tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = indirect[i].first; + tuning_results["PRECISION"] = static_cast<size_t>(precision); scores[i] = TuningResult{ "gemm_kernel_selection", (relative_score * relative_score) * 100 + epsilon, // squared for proper default computation - TuningParameters{ - 
TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first}, - TuningParameter{"PRECISION", static_cast<size_t>(precision)} - } + tuning_results }; } // Displaying results - printf("[ -------> ] value indirect direct score (lowest means best switching point)\n"); + printf("| value | indirect | direct | score | (lowest score == best switching point)\n"); + printf("x---------x-------------x-------------x----------x\n"); for (auto i = size_t{0}; i < indirect.size(); ++i) { assert(indirect[i].first == direct[i].first); const auto value = indirect[i].first; if (indirect[i].second != -1 && direct[i].second != -1) { const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6); const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6); - printf("[ -------> ] %7zu %8.2lf %8.2lf %8.2lf\n", + printf("| %7zu | %8.2lf ms | %8.2lf ms | %8.3lf |\n", value, gflops_indirect, gflops_direct, scores[i].score); } } + printf("x---------x-------------x-------------x----------x\n"); + printf("\n"); + + // Computes the best switching point + auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; + const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison); + const auto best_switching_point = best_configuration->config["XGEMM_MIN_INDIRECT_SIZE"]; + const auto best_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(best_switching_point); // Outputs the results as JSON to disk, including some meta-data const auto precision_string = std::to_string(static_cast<size_t>(precision)); auto metadata = std::vector<std::pair<std::string,std::string>>{ {"kernel_family", "gemm_routine"}, + {"precision", precision_string}, {"arg_from", ToString(from)}, {"arg_to", ToString(to)}, {"arg_step", ToString(step)}, - {"precision", precision_string}, + {"best_kernel", best_configuration->name}, + {"best_time", ToString(best_configuration->score)}, + {"best_parameters", best_string} 
}; PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json", device, platform, metadata, scores); - printf("[ STATUS ] All done\n"); + printf("* Completed tuning process\n"); + printf("\n"); } // ================================================================================================= diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp new file mode 100644 index 00000000..0af17a6f --- /dev/null +++ b/src/tuning/tuning.cpp @@ -0,0 +1,88 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the generic CLBlast auto-tuner (inspired by CLTune). This is only used for +// the optional and stand-alone tuner binaries and not part of the core of CLBlast. 
+// +// ================================================================================================= + +#include <vector> +#include <string> +#include <random> +#include <utility> +#include <algorithm> +#include <iostream> + +#include "utilities/utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// =================================================================================================

// Writes all tuning results to 'filename' as a JSON document. The output starts with the caller-
// supplied 'metadata' key/value pairs, followed by properties queried from the device and the
// platform, and ends with a "results" array holding one entry (kernel name, time, parameters) per
// tuning result. If the file cannot be opened for writing, a message is printed to stderr and
// nothing is written.
void PrintTimingsToFileAsJSON(const std::string &filename,
                              const Device& device, const Platform& platform,
                              const std::vector<std::pair<std::string,std::string>> &metadata,
                              const std::vector<TuningResult>& tuning_results) {
  const auto num_results = tuning_results.size();
  printf("* Writing a total of %zu results to '%s'\n", num_results, filename.c_str());

  // Opening can fail (e.g. read-only directory): writing through a null FILE* is undefined
  // behaviour, so report the problem and skip the JSON output instead
  auto file = fopen(filename.c_str(), "w");
  if (file == nullptr) {
    fprintf(stderr, "* Could not open '%s' for writing, skipping the JSON output\n",
            filename.c_str());
    return;
  }

  // The header: user-supplied meta-data followed by the device and platform properties
  fprintf(file, "{\n");
  for (const auto &datum: metadata) {
    fprintf(file, " \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
  }
  fprintf(file, " \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
  fprintf(file, " \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
  fprintf(file, " \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
  fprintf(file, " \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
  fprintf(file, " \"device\": \"%s\",\n", device.Name().c_str());
  fprintf(file, " \"platform_version\": \"%s\",\n", platform.Version().c_str());
  fprintf(file, " \"device_vendor\": \"%s\",\n", platform.Vendor().c_str());
  fprintf(file, " \"device_type\": \"%s\",\n", device.Type().c_str());
  fprintf(file, " \"device_core_clock\": \"%zu\",\n", device.CoreClock());
  fprintf(file, " \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
  fprintf(file, " \"device_extra_info\": \"%s\",\n", device.GetExtraInfo().c_str());
  fprintf(file, " \"results\": [\n");

  // Loops over all results
  for (auto r = size_t{0}; r < num_results; ++r) {
    const auto &result = tuning_results[r];
    fprintf(file, " {\n");
    fprintf(file, " \"kernel\": \"%s\",\n", result.name.c_str());
    fprintf(file, " \"time\": %.3lf,\n", result.score);

    // Loops over all the parameters for this result, separated by commas
    fprintf(file, " \"parameters\": {");
    const auto num_configs = result.config.size();
    auto p = size_t{0};
    for (const auto &parameter : result.config) {
      fprintf(file, "\"%s\": %zu", parameter.first.c_str(), parameter.second);
      if (p + 1 < num_configs) { fprintf(file, ","); }
      ++p;
    }
    fprintf(file, "}\n");

    // The footer: every entry but the last one is followed by a comma
    fprintf(file, " }");
    if (r + 1 < num_results) { fprintf(file, ","); }
    fprintf(file, "\n");
  }
  fprintf(file, " ]\n");
  fprintf(file, "}\n");
  fclose(file);
}

// Prints a horizontal separator line for the tuner's results table to stdout. The middle part
// grows by five characters per tuning parameter ('parameters_size' of them).
void print_separator(const size_t parameters_size) {
  printf("x------x-------x");
  for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); }
  printf("-x----------------x--------------x--------x-------------------x\n");
}

+// ================================================================================================= +} // namespace clblast diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index bc9c0e03..2c7f6a0b 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -7,26 +7,45 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file implements the interface to the CLTune auto-tuner. This is only used for the optional -// and stand-alone tuner binaries and not part of the core of CLBlast. +// This file implements the generic CLBlast auto-tuner (inspired by CLTune). This is only used for +// the optional and stand-alone tuner binaries and not part of the core of CLBlast.
// // ================================================================================================= -#ifndef CLBLAST_TUNING_H_ -#define CLBLAST_TUNING_H_ +#ifndef CLBLAST_TUNING_TUNING_H_ +#define CLBLAST_TUNING_TUNING_H_ #include <vector> #include <string> #include <random> #include <utility> - -#include <cltune.h> +#include <algorithm> +#include <iostream> +#include <chrono> #include "utilities/utilities.hpp" +#include "utilities/compile.hpp" +#include "utilities/timing.hpp" +#include "tuning/configurations.hpp" namespace clblast { // ================================================================================================= +// Constants holding start and end strings for terminal-output in colour +#if defined(_WIN32) + const std::string kPrintError = ""; + const std::string kPrintSuccess = ""; + const std::string kPrintMessage = ""; + const std::string kPrintEnd = ""; +#else + const std::string kPrintError = "\x1b[31m"; + const std::string kPrintSuccess = "\x1b[32m"; + const std::string kPrintMessage = "\x1b[1m"; + const std::string kPrintEnd = "\x1b[0m"; +#endif + +// ================================================================================================= + // Structures for the tuners with all the default settings struct TunerDefaults { @@ -41,15 +60,7 @@ struct TunerDefaults { // Other defaults size_t default_batch_count = 1; size_t default_num_runs = 10; // run every kernel this many times for averaging - - // Search heuristic defaults double default_fraction = 1.0; - size_t default_swarm_size_PSO = 8; - double default_influence_global_PSO = 0.1; - double default_influence_local_PSO = 0.3; - double default_influence_random_PSO = 0.6; - size_t default_heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); - double default_max_temp_ann = 1.0; }; // Structures for the tuners with the remaining settings @@ -68,6 +79,10 @@ struct TunerSettings { size_t size_c = 1; size_t size_temp = 1; + // Inputs and outputs (X:0, Y:1, A:2, B:3, 
C:4, temp:5) + std::vector<size_t> inputs = {}; + std::vector<size_t> outputs = {}; + // Sets the base thread configuration std::vector<size_t> global_size = {}; std::vector<size_t> global_size_ref = {}; @@ -75,25 +90,32 @@ struct TunerSettings { std::vector<size_t> local_size_ref = {}; // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; TransformVector mul_local = {}; TransformVector div_local = {}; TransformVector mul_global = {}; TransformVector div_global = {}; // Sets the tuning parameters and their possible values - std::vector<std::pair<std::string, std::vector<size_t>>> parameters; + std::vector<Parameter> parameters; // Describes how to compute the performance metrics size_t metric_amount = 0; std::string performance_unit = "N/A"; - - // Returns which search heuristic to use - size_t heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); }; // ================================================================================================= +struct TuningResult { std::string name; double score; Configuration config; }; + +void PrintTimingsToFileAsJSON(const std::string &filename, + const Device& device, const Platform& platform, + const std::vector<std::pair<std::string,std::string>> &metadata, + const std::vector<TuningResult>& tuning_results); + +void print_separator(const size_t parameters_size); + +// ================================================================================================= + // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect // the results. Used for all types of kernel families. Note that this is a header-only function so // that it is automatically compiled for the various kernels (given as the 'C' template argument). 
@@ -115,147 +137,266 @@ void Tuner(int argc, char* argv[]) { if (o == kArgK) { args.k = GetArgument(command_line_args, help, kArgK, defaults.default_k); } if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar<T>()); } if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar<T>()); } - if (o == kArgFraction) { args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction); } if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, defaults.default_batch_count); } - if (o == kArgHeuristicSelection) {args.heuristic_selection = GetArgument(command_line_args, help, kArgHeuristicSelection, defaults.default_heuristic); } - if (o == kArgPsoSwarmSize) {args.pso_swarm_size = GetArgument(command_line_args, help, kArgPsoSwarmSize , defaults.default_swarm_size_PSO); } - if (o == kArgPsoInfGlobal) {args.pso_inf_global = GetArgument(command_line_args, help, kArgPsoInfGlobal, defaults.default_influence_global_PSO); } - if (o == kArgPsoInfLocal) {args.pso_inf_local = GetArgument(command_line_args, help, kArgPsoInfLocal, defaults.default_influence_local_PSO); } - if (o == kArgPsoInfRandom) {args.pso_inf_random = GetArgument(command_line_args, help, kArgPsoInfRandom, defaults.default_influence_random_PSO); } - if (o == kArgAnnMaxTemp) {args.ann_max_temperature = GetArgument(command_line_args, help, kArgAnnMaxTemp, defaults.default_max_temp_ann); } } - const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs); - fprintf(stdout, "%s\n", help.c_str()); + args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction); + args.num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs); + const auto max_l2_norm = GetArgument(command_line_args, help, kArgMaxL2Norm, 1.0e-4); + printf("%s\n", help.c_str()); const TunerSettings settings = 
C::GetTunerSettings(args); // Tests validity of the given arguments C::TestValidArguments(args); + // Initializes OpenCL + const auto platform = Platform(args.platform_id); + const auto device = Device(platform, args.device_id); + const auto context = Context(device); + auto queue = Queue(context, device); + // Tests for validity of the precision and retrieves properties - auto isAMD = false; - auto isARM = false; - auto isGPU = false; - auto device_type = std::string{}; - auto device_vendor = std::string{}; - auto device_architecture = std::string{}; - auto device_name = std::string{}; - { // In a block such that the platform and the device are destroyed before initializing the tuner - const auto platform = Platform(args.platform_id); - const auto device = Device(platform, args.device_id); - if (!PrecisionSupported<T>(device)) { - printf("* Unsupported precision, skipping this tuning run\n\n"); - return; - } - isAMD = device.IsAMD(); - isARM = device.IsARM(); - isGPU = device.IsGPU(); - device_type = GetDeviceType(device); - device_vendor = GetDeviceVendor(device); - device_architecture = GetDeviceArchitecture(device); - device_name = GetDeviceName(device); + if (!PrecisionSupported<T>(device)) { + printf("* Unsupported precision, skipping this tuning run\n\n"); + return; } + const auto device_type = GetDeviceType(device); + const auto device_vendor = GetDeviceVendor(device); + const auto device_architecture = GetDeviceArchitecture(device); + const auto device_name = GetDeviceName(device); // Creates input buffers with random data - auto x_vec = std::vector<T>(settings.size_x); - auto y_vec = std::vector<T>(settings.size_y); - auto a_mat = std::vector<T>(settings.size_a); - auto b_mat = std::vector<T>(settings.size_b); - auto c_mat = std::vector<T>(settings.size_c); - auto temp = std::vector<T>(settings.size_temp); + const auto buffer_sizes = std::vector<size_t>{ + settings.size_x, settings.size_y, + settings.size_a, settings.size_b, settings.size_c, + 
settings.size_temp + }; std::mt19937 mt(kSeed); std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); - PopulateVector(x_vec, mt, dist); - PopulateVector(y_vec, mt, dist); - PopulateVector(a_mat, mt, dist); - PopulateVector(b_mat, mt, dist); - PopulateVector(c_mat, mt, dist); - PopulateVector(temp, mt, dist); - - // Initializes the tuner for the chosen device - cltune::Tuner tuner(args.platform_id, args.device_id); - - // Select the search method based on the command-line arguments - // If the tuner does not support the selected choice, full search will be returned. - auto method = settings.heuristic; - if (method == 1) { tuner.UseRandomSearch(1.0/args.fraction); } - else if (method == 2) { tuner.UseAnnealing(1.0/args.fraction, args.ann_max_temperature); } - else if (method == 3) { tuner.UsePSO(1.0/args.fraction, args.pso_swarm_size, args.pso_inf_global, - args.pso_inf_local, args.pso_inf_random); } - else { tuner.UseFullSearch(); } - - // Set extra settings for specific defines. This mimics src/routine.cc. 
- auto defines = std::string{""}; - if (isAMD && isGPU) { - defines += "#define USE_CL_MAD 1\n"; - defines += "#define USE_STAGGERED_INDICES 1\n"; + auto source_buffers = std::vector<std::vector<T>>(); + auto reference_buffers = std::vector<std::vector<T>>(); + auto result_buffers = std::vector<std::vector<T>>(); + auto device_buffers = std::vector<Buffer<T>>(); + for (const auto size : buffer_sizes) { + auto host_buffer = std::vector<T>(size); + PopulateVector(host_buffer, mt, dist); + source_buffers.push_back(host_buffer); + auto reference_buffer = std::vector<T>(size); + reference_buffers.push_back(reference_buffer); + auto result_buffer = std::vector<T>(size); + result_buffers.push_back(result_buffer); + auto device_buffer = Buffer<T>(context, size); + device_buffers.push_back(device_buffer); } - if (isARM && isGPU) { - defines += "#define GLOBAL_MEM_FENCE 1\n"; - } - - // Loads the kernel sources and defines the kernel to tune - auto sources = defines + settings.sources; - auto id = tuner.AddKernelFromString(sources, settings.kernel_name, settings.global_size, settings.local_size); - tuner.SetReferenceFromString(sources, settings.kernel_name, settings.global_size_ref, settings.local_size_ref); // Sets the tunable parameters and their possible values - for (const auto ¶meter: settings.parameters) { - tuner.AddParameter(id, parameter.first, parameter.second); + auto configurations = SetConfigurations(settings.parameters, C::SetConstraints()); + printf("* Found %s%zu configuration(s)%s\n", + kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); + + // Select the search method (full search or a random fraction) + if (args.fraction != 0.0 && args.fraction != 1.0) { + const auto new_size = static_cast<size_t>(configurations.size() / args.fraction); + auto rng = std::default_random_engine{}; + std::shuffle(std::begin(configurations), std::end(configurations), rng); + configurations.resize(new_size); + printf("* Exploring a random subset of %s%zu 
configuration(s)%s\n", + kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); } - C::SetConstraints(tuner, id); - C::SetLocalMemorySize(tuner, id, args); - // Tests for a specific precision - tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); - tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + // Prints information about the parameters + printf("* Parameters explored: "); + for (const auto& parameter : settings.parameters) { printf("%s ", parameter.first.c_str()); } + printf("\n"); + + // Prints the header of the table + printf("\n"); + printf("| ID | total |"); + for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } + printf("param | compiles | time | %6s | status |\n", settings.performance_unit.c_str()); + print_separator(settings.parameters.size()); + + // First runs a reference example to compare against + try { + printf("| ref | - |"); + for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } + printf(" - |"); - // Modifies the thread-sizes (both global and local) based on the parameters - for (auto ¶meters: settings.mul_local) { tuner.MulLocalSize(id, parameters); } - for (auto ¶meters: settings.div_local) { tuner.DivLocalSize(id, parameters); } - for (auto ¶meters: settings.mul_global) { tuner.MulGlobalSize(id, parameters); } - for (auto ¶meters: settings.div_global) { tuner.DivGlobalSize(id, parameters); } - // Sets the function's arguments - C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp); + // Sets the input + for (const auto id : settings.inputs) { + device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); + } + + // Compiles the kernel + auto compiler_options = std::vector<std::string>(); + const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name, + device, context, compiler_options); + auto kernel = Kernel(program, settings.kernel_name); + 
C::SetArguments(kernel, args, device_buffers); + printf(" %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str()); + + // Runs the kernel + const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, + settings.global_size_ref, settings.local_size_ref); + printf(" - |"); + if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); } + + // Saves the result + for (const auto id : settings.outputs) { + device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]); + } + printf(" %sreference OK%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str()); + } + catch (...) { + const auto status_code = DispatchExceptionCatchAll(true); + printf("* Exception caught with status %d while running the reference, aborting\n", + static_cast<int>(status_code)); + return; + } + print_separator(settings.parameters.size()); // Starts the tuning process - tuner.SetNumRuns(num_runs); - tuner.Tune(); + auto results = std::vector<TuningResult>(); + for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) { + try { + + auto configuration = configurations[config_id]; + printf("| %4zu | %5zu |", config_id + 1, configurations.size()); + for (const auto& parameter : settings.parameters) { + printf("%5zu", configuration.at(parameter.first)); + } + printf(" |"); + + // Sets the input + for (const auto id : settings.inputs) { + device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); + } + + // Sets the thread configuration + const auto global = SetThreadConfiguration(configuration, settings.global_size, + settings.mul_global, settings.div_global); + const auto local = SetThreadConfiguration(configuration, settings.local_size, + settings.mul_local, settings.div_local); + + // Sets the parameters for this configuration + auto kernel_source = std::string{""}; + for (const auto &parameter : configuration) { + kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n"; + } + kernel_source += 
settings.sources; + + // Compiles the kernel + const auto start_time = std::chrono::steady_clock::now(); + auto compiler_options = std::vector<std::string>(); + const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name, + device, context, compiler_options, true); + auto kernel = Kernel(program, settings.kernel_name); + const auto elapsed_time = std::chrono::steady_clock::now() - start_time; + const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count(); + printf(" %sOK%s %5.0lf ms |", kPrintSuccess.c_str(), kPrintEnd.c_str(), timing); + + // Runs the kernel + C::SetArguments(kernel, args, device_buffers); + const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local); + + // Kernel run was not successful + if (time_ms == -1.0) { + printf(" - |"); + printf(" %sinvalid config.%s |", kPrintError.c_str(), kPrintEnd.c_str()); + printf(" <-- skipping\n"); + continue; + } + + // Compares the results + auto l2_error = 0.0; + for (const auto id : settings.outputs) { + device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]); + for (auto index = size_t{0}; index<buffer_sizes[id]; ++index) { + const auto diff = SquaredDifference(result_buffers[id][index], reference_buffers[id][index]); + l2_error += diff; + } + l2_error /= static_cast<double>(buffer_sizes[id]); + if (std::isnan(l2_error) || l2_error > max_l2_norm) { + printf(" - |"); + printf(" %sL2 error %8.2e%s |", kPrintError.c_str(), l2_error, kPrintEnd.c_str()); + throw std::runtime_error("L2 error too large"); + } + } + + // All was OK + configuration["PRECISION"] = static_cast<size_t>(args.precision); + results.push_back(TuningResult{settings.kernel_name, time_ms, configuration}); + printf(" %6.1lf |", settings.metric_amount / (time_ms * 1.0e6)); + printf(" %sresults match%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str()); + } + catch (const CLCudaAPIBuildError &e) { + const auto status_code = DispatchExceptionCatchAll(true); + 
printf(" %scompilation error: %5d%s |", + kPrintError.c_str(), static_cast<int>(status_code), kPrintEnd.c_str()); + printf(" - | - | <-- skipping\n"); + } + catch (...) { + const auto status_code = DispatchExceptionCatchAll(true); + if (status_code != StatusCode::kUnknownError) { + printf(" %serror code %d%s |", + kPrintError.c_str(), static_cast<int>(status_code), kPrintEnd.c_str()); + } + printf(" <-- skipping\n"); + } + } + + // Completed the tuning process + print_separator(settings.parameters.size()); + printf("\n"); + if (results.size() == 0) { return; } - // Prints the results to screen - auto time_ms = tuner.PrintToScreen(); - tuner.PrintFormatted(); + // Computes the best results + auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; + const auto best_configuration = std::min_element(results.begin(), results.end(), comparison); + const auto best_time_ms = best_configuration->score; + if (best_time_ms == 0.0) { return; } // Also prints the performance of the best-case in terms of GB/s or GFLOPS - if (time_ms != 0.0) { - printf("[ -------> ] %.2lf ms", time_ms); - printf(" or %.1lf %s\n", settings.metric_amount/(time_ms*1.0e6), settings.performance_unit.c_str()); + printf("\n"); + printf("* Found best result %.2lf ms", best_time_ms); + printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6), + settings.performance_unit.c_str()); + printf("* Best parameters: "); + auto best_string = std::string{""}; + auto i = size_t{0}; + for (const auto config : best_configuration->config) { + best_string += "" + config.first + "=" + ToString(config.second); + if (i < best_configuration->config.size() - 1) { best_string += " "; } + ++i; } + printf("%s\n\n", best_string.c_str()); // Outputs the results as JSON to disk, including some meta-data auto precision_string = std::to_string(static_cast<size_t>(args.precision)); auto metadata = std::vector<std::pair<std::string,std::string>>{ {"kernel_family", 
settings.kernel_family}, {"precision", precision_string}, - {"clblast_device_type", device_type}, - {"clblast_device_vendor", device_vendor}, - {"clblast_device_architecture", device_architecture}, - {"clblast_device_name", device_name} + {"best_kernel", best_configuration->name}, + {"best_time", ToString(best_configuration->score)}, + {"best_parameters", best_string} }; for (auto &o: defaults.options) { - if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); } - if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); } - if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); } + if (o == kArgM) { metadata.push_back({"arg_m", ToString(args.m)}); } + if (o == kArgN) { metadata.push_back({"arg_n", ToString(args.n)}); } + if (o == kArgK) { metadata.push_back({"arg_k", ToString(args.k)}); } if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); } if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); } } - tuner.PrintJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json", metadata); - + PrintTimingsToFileAsJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json", + device, platform, metadata, results); + + printf("* Completed tuning process\n"); + printf("\n"); } // ================================================================================================= } // namespace clblast -// CLBLAST_TUNING_H_ +// CLBLAST_TUNING_TUNING_H_ #endif diff --git a/src/utilities/clblast_exceptions.cpp b/src/utilities/clblast_exceptions.cpp index 32526215..25e5f4be 100644 --- a/src/utilities/clblast_exceptions.cpp +++ b/src/utilities/clblast_exceptions.cpp @@ -45,7 +45,7 @@ RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreas // 
================================================================================================= -StatusCode DispatchException() +StatusCode DispatchException(const bool silent) { const char *message = nullptr; StatusCode status; @@ -66,12 +66,41 @@ StatusCode DispatchException() status = StatusCode::kUnknownError; } - if (message) { + if (message && !silent) { fprintf(stderr, "CLBlast: %s\n", message); } return status; }

// Converts the exception that is currently being handled into a StatusCode. This re-throws the
// in-flight exception ('throw;'), so it has to be called from within a catch-block. Exception types
// not listed explicitly end up in the final catch-all and are reported as kUnknownError. Unless
// 'silent' is set, the message of the exception (if any) is printed to stderr.
StatusCode DispatchExceptionCatchAll(const bool silent)
{
  const char *message = nullptr;
  StatusCode status;

  // Re-throws the active exception to dispatch on its type
  try {
    throw;
  } catch (BLASError &e) {
    // no message is printed for invalid argument errors
    status = e.status();
  } catch (CLCudaAPIError &e) {
    message = e.what();
    status = static_cast<StatusCode>(e.status());
  } catch (RuntimeErrorCode &e) {
    message = e.what();
    status = e.status();
  } catch (Error<std::runtime_error> &e) {
    message = e.what();
    status = StatusCode::kUnknownError;
  } catch (...) {
    message = "unknown exception type";
    status = StatusCode::kUnknownError;
  }

  // Only reports the error when there is a message and the caller did not ask for silence
  if (message && !silent) {
    fprintf(stderr, "CLBlast: %s\n", message);
  }
  return status;
}
+ // ================================================================================================= StatusCode DispatchExceptionForC() diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp index a790be9c..9bd38187 100644 --- a/src/utilities/clblast_exceptions.hpp +++ b/src/utilities/clblast_exceptions.hpp @@ -37,7 +37,8 @@ class RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> { // ================================================================================================= // Handles (most of the) runtime exceptions and converts them to StatusCode -StatusCode DispatchException(); +StatusCode DispatchException(const bool silent = false); +StatusCode DispatchExceptionCatchAll(const bool silent = false); // Handles remaining exceptions and converts them to StatusCode::kUnhandledError
StatusCode DispatchExceptionForC(); diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp new file mode 100644 index 00000000..2a55506e --- /dev/null +++ b/src/utilities/compile.cpp @@ -0,0 +1,99 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the kernel compilation functions (see the header for more information). +// +// ================================================================================================= + +#include <vector> +#include <chrono> + +#include "routines/common.hpp" + +namespace clblast { +// ================================================================================================= + +// Compiles a program from source code +Program CompileFromSource(const std::string &source_string, const Precision precision, + const std::string &routine_name, + const Device& device, const Context& context, + std::vector<std::string>& options, const bool silent) { + auto header_string = std::string{""}; + + header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n"; + + // Adds the name of the routine as a define + header_string += "#define ROUTINE_" + routine_name + "\n"; + + // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on + // which it is known to work with all OpenCL platforms. + if (device.IsNVIDIA() || device.IsARM()) { + header_string += "#define USE_INLINE_KEYWORD 1\n"; + } + + // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve + // performance, but might result in a reduced accuracy. 
+ if (device.IsAMD() && device.IsGPU()) { + header_string += "#define USE_CL_MAD 1\n"; + } + + // For specific devices, use staggered/shuffled workgroup indices. + if (device.IsAMD() && device.IsGPU()) { + header_string += "#define USE_STAGGERED_INDICES 1\n"; + } + + // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize + // performance through better cache behaviour + if (device.IsARM() && device.IsGPU()) { + header_string += "#define GLOBAL_MEM_FENCE 1\n"; + } + + // Optionally adds a translation header from OpenCL kernels to CUDA kernels + #ifdef CUDA_API + source_string += + #include "kernels/opencl_to_cuda.h" + ; + #endif + + // Loads the common header (typedefs and defines and such) + header_string += + #include "kernels/common.opencl" + ; + + // Prints details of the routine to compile in case of debugging in verbose mode + #ifdef VERBOSE + printf("[DEBUG] Compiling routine '%s-%s'\n", + routine_name.c_str(), ToString(precision).c_str()); + const auto start_time = std::chrono::steady_clock::now(); + #endif + + // Compiles the kernel + auto program = Program(context, header_string + source_string); + try { + program.Build(device, options); + } catch (const CLCudaAPIBuildError &e) { + if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) { + fprintf(stdout, "OpenCL compiler error/warning:\n%s\n", + program.GetBuildInfo(device).c_str()); + } + throw; + } + + // Prints the elapsed compilation time in case of debugging in verbose mode + #ifdef VERBOSE + const auto elapsed_time = std::chrono::steady_clock::now() - start_time; + const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count(); + printf("[DEBUG] Completed compilation in %.2lf ms\n", timing); + #endif + + return program; +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp new file mode 
100644 index 00000000..0315d70c --- /dev/null +++ b/src/utilities/compile.hpp @@ -0,0 +1,36 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the CLBlast way to compile a kernel from source, used for the library and for +// the auto-tuners. +// +// ================================================================================================= + +#ifndef CLBLAST_UTILITIES_COMPILE_H_ +#define CLBLAST_UTILITIES_COMPILE_H_ + +#include <string> +#include <vector> + +#include "utilities/utilities.hpp" + +namespace clblast { +// ================================================================================================= + +// Compiles a program from source code +Program CompileFromSource(const std::string &source_string, const Precision precision, + const std::string &routine_name, + const Device& device, const Context& context, + std::vector<std::string>& options, const bool silent = false); + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_UTILITIES_COMPILE_H_ +#endif diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp new file mode 100644 index 00000000..af6a8ff2 --- /dev/null +++ b/src/utilities/timing.cpp @@ -0,0 +1,79 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file provides helper functions for time measurement and such. +// +// ================================================================================================= + +#include <cstdio> +#include <exception> + +#include "utilities/timing.hpp" + +namespace clblast { +// ================================================================================================= + +double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device, + std::vector<size_t> global, const std::vector<size_t> &local) { + auto event = Event(); + + if (!local.empty()) { + // Tests for validity of the local thread sizes + if (local.size() > device.MaxWorkItemDimensions()) { + throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions); + } + const auto max_work_item_sizes = device.MaxWorkItemSizes(); + for (auto i=size_t{0}; i<local.size(); ++i) { + if (local[i] > max_work_item_sizes[i]) { + throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim); + } + } + auto local_size = size_t{1}; + for (auto &item: local) { local_size *= item; } + if (local_size > device.MaxWorkGroupSize()) { + throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal); + } + + // Make sure the global thread sizes are at least equal to the local sizes + for (auto i=size_t{0}; i<global.size(); ++i) { + if (global[i] < local[i]) { global[i] = local[i]; } + } + } + + // Tests for local memory usage + const auto local_mem_usage = kernel.LocalMemUsage(device); + if (!device.IsLocalMemoryValid(local_mem_usage)) { + throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage); + } + + // Times the kernel + const auto run_kernel_func = [&]() { + kernel.Launch(queue, global, local, event.pointer()); + event.WaitForCompletion(); + queue.Finish(); + }; + return TimeFunction(num_runs, run_kernel_func); +} + +double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device, + 
std::vector<size_t> global, const std::vector<size_t> &local) { + try { + const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local); + printf(" %9.2lf ms |", time_ms); + return time_ms; + } + catch (...) { + const auto status_code = DispatchExceptionCatchAll(true); + printf(" error %-5d |", static_cast<int>(status_code)); + return -1.0; // invalid + } +} + +// ================================================================================================= +} // namespace clblast diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp index bfad6147..a66aba4b 100644 --- a/src/utilities/timing.hpp +++ b/src/utilities/timing.hpp @@ -40,6 +40,14 @@ double TimeFunction(const size_t num_runs, F const &function) { // ================================================================================================= +double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device, + std::vector<size_t> global, const std::vector<size_t> &local); + +double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device, + std::vector<size_t> global, const std::vector<size_t> &local); + +// ================================================================================================= + using Timing = std::pair<size_t, double>; template <typename T, typename F> @@ -47,76 +55,27 @@ std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t const size_t num_runs, const Queue& queue, const std::vector<Buffer<T>>& buffers, F const &routine) { auto timings = std::vector<Timing>(); + printf("| value | time |\n"); + printf("x--------x--------------x\n"); for (auto value = from; value < to; value += step) { - printf("[ RUN ] Running with value %zu\n", value); + printf("| %6zu |", value); try { const auto FunctionToTune = [&]() { routine(value, queue, buffers); }; const auto time_ms = TimeFunction(num_runs, FunctionToTune); - printf("[ OK ] Took %.2lf ms\n", time_ms); + 
printf(" %9.2lf ms |\n", time_ms); timings.push_back({value, time_ms}); } catch (...) { - printf("[ ERROR ] Exception caught\n"); + const auto status_code = DispatchExceptionCatchAll(true); + printf(" error %-5d |\n", static_cast<int>(status_code)); timings.push_back({value, -1.0}); // invalid } } + printf("x--------x--------------x\n"); return timings; } // ================================================================================================= - -using TuningParameter = std::pair<std::string, size_t>; -using TuningParameters = std::vector<TuningParameter>; -struct TuningResult { std::string name; double score; TuningParameters parameters; }; - -void PrintTimingsToFileAsJSON(const std::string &filename, - const Device& device, const Platform& platform, - const std::vector<std::pair<std::string,std::string>> &metadata, - const std::vector<TuningResult>& tuning_results) { - printf("[ STATUS ] Writing results to '%s'\n", filename.c_str()); - auto file = fopen(filename.c_str(), "w"); - fprintf(file, "{\n"); - for (auto &datum: metadata) { - fprintf(file, " \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str()); - } - fprintf(file, " \"platform_version\": \"%s\",\n", platform.Version().c_str()); - fprintf(file, " \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str()); - fprintf(file, " \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str()); - fprintf(file, " \"clblast_device_type\": \"%s\",\n", device.Type().c_str()); - fprintf(file, " \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str()); - fprintf(file, " \"device_core_clock\": \"%zu\",\n", device.CoreClock()); - fprintf(file, " \"device_compute_units\": \"%zu\",\n", device.ComputeUnits()); - fprintf(file, " \"results\": [\n"); - - // Loops over all results - auto num_results = tuning_results.size(); - for (auto r = size_t{0}; r < num_results; ++r) { - auto result = tuning_results[r]; - fprintf(file, " {\n"); - fprintf(file, " \"kernel\": 
\"%s\",\n", result.name.c_str()); - fprintf(file, " \"time\": %.3lf,\n", result.score); - - // Loops over all the parameters for this result - fprintf(file, " \"parameters\": {"); - auto num_configs = result.parameters.size(); - for (auto p=size_t{0}; p<num_configs; ++p) { - auto config = result.parameters[p]; - fprintf(file, "\"%s\": %zu", config.first.c_str(), config.second); - if (p < num_configs-1) { fprintf(file, ","); } - } - fprintf(file, "}\n"); - - // The footer - fprintf(file, " }"); - if (r < num_results - 1) { fprintf(file, ","); } - fprintf(file, "\n"); - } - fprintf(file, " ]\n"); - fprintf(file, "}\n"); - fclose(file); -} - -// ================================================================================================= } // namespace clblast // CLBLAST_TIMING_H_ diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index f2574104..1546fbf5 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -397,6 +397,37 @@ template <> bool PrecisionSupported<half>(const Device &device) { return device. 
// ================================================================================================= +// Retrieves the squared difference, used for example for computing the L2 error +template <typename T> +double SquaredDifference(const T val1, const T val2) { + const auto difference = (val1 - val2); + return static_cast<double>(difference * difference); +} + +// Compiles the default case for standard data-types +template double SquaredDifference<float>(const float, const float); +template double SquaredDifference<double>(const double, const double); + +// Specialisations for non-standard data-types +template <> +double SquaredDifference(const float2 val1, const float2 val2) { + const auto real = SquaredDifference(val1.real(), val2.real()); + const auto imag = SquaredDifference(val1.imag(), val2.imag()); + return real + imag; +} +template <> +double SquaredDifference(const double2 val1, const double2 val2) { + const auto real = SquaredDifference(val1.real(), val2.real()); + const auto imag = SquaredDifference(val1.imag(), val2.imag()); + return real + imag; +} +template <> +double SquaredDifference(const half val1, const half val2) { + return SquaredDifference(HalfToFloat(val1), HalfToFloat(val2)); +} + +// ================================================================================================= + // High-level info std::string GetDeviceType(const Device& device) { return device.Type(); diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index f56226be..e26721b3 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -98,6 +98,7 @@ constexpr auto kArgDilationW = "dilationw"; // The tuner-specific arguments in string form constexpr auto kArgFraction = "fraction"; constexpr auto kArgHeuristicSelection = "heuristic"; +constexpr auto kArgMaxL2Norm = "max_l2_norm"; // PSO tuner-specific arguments in string form constexpr auto kArgPsoSwarmSize = "pso_swarm_size"; constexpr auto kArgPsoInfGlobal = "pso_inf_global"; @@ 
-323,6 +324,12 @@ bool PrecisionSupported(const Device &device); // ================================================================================================= +// Retrieves the squared difference, used for example for computing the L2 error +template <typename T> +double SquaredDifference(const T val1, const T val2); + +// ================================================================================================= + // Device information in a specific CLBlast form std::string GetDeviceType(const Device& device); std::string GetDeviceVendor(const Device& device); |