diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2015-08-20 08:38:18 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2015-08-20 08:38:18 +0200 |
commit | cf168fca7088fb050a10b2e82789c6cd9356716b (patch) | |
tree | 0f6a96976c6f74f816bfc9d53dc7c7e323e7d43d /include | |
parent | 85bd783e0d80de9aec0a4bcd49ab4acd88ab07d0 (diff) | |
parent | 15db2bcc208d8e5bccf0464396431c7d8e6f3f28 (diff) |
Merge pull request #23 from CNugteren/tuner_database
Added initial version of a tuner-database
Diffstat (limited to 'include')
-rw-r--r-- | include/internal/tuning.h | 146 | ||||
-rw-r--r-- | include/internal/utilities.h | 6 |
2 files changed, 117 insertions, 35 deletions
diff --git a/include/internal/tuning.h b/include/internal/tuning.h index d0cf6b5d..f029c704 100644 --- a/include/internal/tuning.h +++ b/include/internal/tuning.h @@ -7,9 +7,8 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file implements the header for the tuner functions. This is only used for the optional -// and stand-alone tuner binaries and not part of the core of CLBlast. The convention used here is -// that X and Y are vectors, while A, B, and C are matrices. +// This file implements the interface to the CLTune auto-tuner. This is only used for the optional +// and stand-alone tuner binaries and not part of the core of CLBlast. // // ================================================================================================= @@ -17,44 +16,121 @@ #define CLBLAST_TUNING_H_ #include <vector> -#include <functional> +#include <string> #include <cltune.h> namespace clblast { // ================================================================================================= -// Functions with two or three OpenCL memory buffers -template <typename T> -using Tuner2 = std::function<void(const Arguments<T>&, - const std::vector<T>&, std::vector<T>&, - cltune::Tuner&)>; -template <typename T> -using Tuner3 = std::function<void(const Arguments<T>&, - const std::vector<T>&, const std::vector<T>&, std::vector<T>&, - cltune::Tuner&)>; - -// As above, but now with an additional ID for the variation -template <typename T> -using Tuner3V = std::function<void(const Arguments<T>&, const size_t, - const std::vector<T>&, const std::vector<T>&, std::vector<T>&, - cltune::Tuner&)>; - -// Tuner for vector-vector input -template <typename T> -void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function); - -// Tuner for matrix-vector-vector input -template <typename T> -void TunerAXY(int argc, char* argv[], const size_t num_variations, const Tuner3V<T> &tune_function); - -// Tuner for matrix-matrix input -template <typename T> -void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function); - -// Tuner for matrix-matrix-matrix input -template <typename T> -void TunerABC(int argc, char* argv[], const Tuner3<T> &tune_function); +// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect +// the results. Used for all types of kernel families. Note that this is a header-only function so +// that it is automatically compiled for the various kernels (given as the 'C' template argument). +template <typename C, typename T> +void Tuner(int argc, char* argv[]) { + + // Sets the parameters and platform/device for which to tune (command-line options) + auto help = std::string{"* Options given/available:\n"}; + auto args = Arguments<T>{}; + args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); + args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); + args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); + for (auto &o: C::GetOptions()) { + if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, C::DefaultM()); } + if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, C::DefaultN()); } + if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, C::DefaultK()); } + if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); } + if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); } + if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); } + } + fprintf(stdout, "%s\n", help.c_str()); + + // Tests validity of the given arguments + C::TestValidArguments(args); + + // Tests for validity of the precision + { + auto platform = Platform(args.platform_id); + auto device = Device(platform, args.device_id); + if (!PrecisionSupported<T>(device)) { + printf("* Unsupported precision, skipping this tuning run\n\n"); + return; + } + } + + // Creates input buffers with random data + auto x_vec = std::vector<T>(C::GetSizeX(args)); + auto y_vec = std::vector<T>(C::GetSizeY(args)); + auto a_mat = std::vector<T>(C::GetSizeA(args)); + auto b_mat = std::vector<T>(C::GetSizeB(args)); + auto c_mat = std::vector<T>(C::GetSizeC(args)); + PopulateVector(x_vec); + PopulateVector(y_vec); + PopulateVector(a_mat); + PopulateVector(b_mat); + PopulateVector(c_mat); + + // Initializes the tuner for the chosen device + cltune::Tuner tuner(args.platform_id, args.device_id); + + // Use full-search to explore all parameter combinations or random-search to search only a part of + // the parameter values. The fraction is set as a command-line argument. + if (args.fraction == 1.0 || args.fraction == 0.0) { + tuner.UseFullSearch(); + } + else { + tuner.UseRandomSearch(1.0/args.fraction); + } + + // Loads the kernel sources and defines the kernel to tune + auto sources = C::GetSources(); + auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); + tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSizeRef()); + + // Sets the tunable parameters and their possible values + C::SetParameters(tuner, id); + C::SetConstraints(tuner, id); + C::SetLocalMemorySize(tuner, id, args); + + // Tests for a specific precision + tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); + tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + + // Modifies the thread-sizes (both global and local) based on the parameters + for (auto ¶meters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); } + for (auto ¶meters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); } + for (auto ¶meters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); } + for (auto ¶meters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); } + + // Sets the function's arguments + C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat); + + // Starts the tuning process + tuner.Tune(); + + // Prints the results to screen + auto time_ms = tuner.PrintToScreen(); + tuner.PrintFormatted(); + + // Also prints the performance of the best-case in terms of GB/s or GFLOPS + if (time_ms != 0.0) { + printf("[ -------> ] %.1lf ms", time_ms); + printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str()); + } + + // Outputs the results as JSON to disk, including some meta-data + auto precision_string = std::to_string(static_cast<size_t>(args.precision)); + auto metadata = std::vector<std::pair<std::string,std::string>>{ + {"kernel_family", C::KernelFamily()}, + {"precision", precision_string} + }; + for (auto &o: C::GetOptions()) { + if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); } + if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); } + if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); } + } + tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata); +} // ================================================================================================= } // namespace clblast diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 6dba24e1..d9fdb9ab 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -198,6 +198,12 @@ bool IsMultiple(const size_t a, const size_t b); size_t GetBytes(const Precision precision); // ================================================================================================= + +// Returns false is this precision is not supported by the device +template <typename T> +bool PrecisionSupported(const Device &device); + +// ================================================================================================= } // namespace clblast // CLBLAST_UTILITIES_H_ |