diff options
Diffstat (limited to 'src/tuning')
-rw-r--r-- | src/tuning/kernels/copy_fast.cc (renamed from src/tuning/copy_fast.cc) | 4 | ||||
-rw-r--r-- | src/tuning/kernels/copy_pad.cc (renamed from src/tuning/copy_pad.cc) | 4 | ||||
-rw-r--r-- | src/tuning/kernels/transpose_fast.cc (renamed from src/tuning/transpose_fast.cc) | 4 | ||||
-rw-r--r-- | src/tuning/kernels/transpose_pad.cc (renamed from src/tuning/transpose_pad.cc) | 4 | ||||
-rw-r--r-- | src/tuning/kernels/xaxpy.cc (renamed from src/tuning/xaxpy.cc) | 4 | ||||
-rw-r--r-- | src/tuning/kernels/xdot.cc (renamed from src/tuning/xdot.cc) | 4 | ||||
-rw-r--r-- | src/tuning/kernels/xgemm.cc (renamed from src/tuning/xgemm.cc) | 4 | ||||
-rw-r--r-- | src/tuning/kernels/xgemv.cc (renamed from src/tuning/xgemv.cc) | 4 | ||||
-rw-r--r-- | src/tuning/kernels/xger.cc (renamed from src/tuning/xger.cc) | 4 | ||||
-rw-r--r-- | src/tuning/tuning.hpp | 161 |
10 files changed, 179 insertions, 18 deletions
diff --git a/src/tuning/copy_fast.cc b/src/tuning/kernels/copy_fast.cc index 09fdbaba..34269bc7 100644 --- a/src/tuning/copy_fast.cc +++ b/src/tuning/kernels/copy_fast.cc @@ -14,8 +14,8 @@ #include <string> #include <vector> -#include "internal/utilities.h" -#include "internal/tuning.h" +#include "utilities.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= diff --git a/src/tuning/copy_pad.cc b/src/tuning/kernels/copy_pad.cc index 7088b3bf..1e0dccd3 100644 --- a/src/tuning/copy_pad.cc +++ b/src/tuning/kernels/copy_pad.cc @@ -14,8 +14,8 @@ #include <string> #include <vector> -#include "internal/utilities.h" -#include "internal/tuning.h" +#include "utilities.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= diff --git a/src/tuning/transpose_fast.cc b/src/tuning/kernels/transpose_fast.cc index 3b0bdeb5..7ac19cb6 100644 --- a/src/tuning/transpose_fast.cc +++ b/src/tuning/kernels/transpose_fast.cc @@ -14,8 +14,8 @@ #include <string> #include <vector> -#include "internal/utilities.h" -#include "internal/tuning.h" +#include "utilities.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= diff --git a/src/tuning/transpose_pad.cc b/src/tuning/kernels/transpose_pad.cc index b9ab3ffa..63274415 100644 --- a/src/tuning/transpose_pad.cc +++ b/src/tuning/kernels/transpose_pad.cc @@ -14,8 +14,8 @@ #include <string> #include <vector> -#include "internal/utilities.h" -#include "internal/tuning.h" +#include "utilities.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= diff --git a/src/tuning/xaxpy.cc b/src/tuning/kernels/xaxpy.cc index d27cb73d..88d12c1f 100644 --- a/src/tuning/xaxpy.cc +++ b/src/tuning/kernels/xaxpy.cc @@ -14,8 +14,8 @@ #include <string> #include <vector> -#include "internal/utilities.h" -#include "internal/tuning.h" +#include "utilities.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= diff --git a/src/tuning/xdot.cc b/src/tuning/kernels/xdot.cc index 5f30296c..1581e13f 100644 --- a/src/tuning/xdot.cc +++ b/src/tuning/kernels/xdot.cc @@ -15,8 +15,8 @@ #include <string> #include <vector> -#include "internal/utilities.h" -#include "internal/tuning.h" +#include "utilities.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= diff --git a/src/tuning/xgemm.cc b/src/tuning/kernels/xgemm.cc index d309b830..4b1efdef 100644 --- a/src/tuning/xgemm.cc +++ b/src/tuning/kernels/xgemm.cc @@ -14,8 +14,8 @@ #include <string> #include <vector> -#include "internal/utilities.h" -#include "internal/tuning.h" +#include "utilities.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= diff --git a/src/tuning/xgemv.cc b/src/tuning/kernels/xgemv.cc index 6587dcf4..d42155ae 100644 --- a/src/tuning/xgemv.cc +++ b/src/tuning/kernels/xgemv.cc @@ -17,8 +17,8 @@ #include <string> #include <vector> -#include "internal/utilities.h" -#include "internal/tuning.h" +#include "utilities.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= diff --git a/src/tuning/xger.cc b/src/tuning/kernels/xger.cc index 4be80c86..d2590c53 100644 --- a/src/tuning/xger.cc +++ b/src/tuning/kernels/xger.cc @@ -14,8 +14,8 @@ #include <string> #include <vector> -#include "internal/utilities.h" -#include "internal/tuning.h" +#include "utilities.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp new file mode 100644 index 00000000..19df5f9a --- /dev/null +++ b/src/tuning/tuning.hpp @@ -0,0 +1,161 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the interface to the CLTune auto-tuner. This is only used for the optional +// and stand-alone tuner binaries and not part of the core of CLBlast. +// +// ================================================================================================= + +#ifndef CLBLAST_TUNING_H_ +#define CLBLAST_TUNING_H_ + +#include <vector> +#include <string> + +#include <cltune.h> + +#include "utilities.hpp" + +namespace clblast { +// ================================================================================================= + +// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect +// the results. Used for all types of kernel families. Note that this is a header-only function so +// that it is automatically compiled for the various kernels (given as the 'C' template argument). +template <typename C, typename T> +void Tuner(int argc, char* argv[]) { + + // Sets the parameters and platform/device for which to tune (command-line options) + auto help = std::string{"* Options given/available:\n"}; + auto args = Arguments<T>{}; + args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); + args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); + args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); + for (auto &o: C::GetOptions()) { + if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, C::DefaultM()); } + if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, C::DefaultN()); } + if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, C::DefaultK()); } + if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); } + if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); } + if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); } + } + fprintf(stdout, "%s\n", help.c_str()); + + // Tests validity of the given arguments + C::TestValidArguments(args); + + // Tests for validity of the precision and retrieves properties + auto isAMD = false; + auto isARM = false; + auto isGPU = false; + { + const auto platform = Platform(args.platform_id); + const auto device = Device(platform, args.device_id); + if (!PrecisionSupported<T>(device)) { + printf("* Unsupported precision, skipping this tuning run\n\n"); + return; + } + isAMD = device.IsAMD(); + isARM = device.IsARM(); + isGPU = device.IsGPU(); + } + + // Creates input buffers with random data + auto x_vec = std::vector<T>(C::GetSizeX(args)); + auto y_vec = std::vector<T>(C::GetSizeY(args)); + auto a_mat = std::vector<T>(C::GetSizeA(args)); + auto b_mat = std::vector<T>(C::GetSizeB(args)); + auto c_mat = std::vector<T>(C::GetSizeC(args)); + auto temp = std::vector<T>(C::GetSizeTemp(args)); + PopulateVector(x_vec); + PopulateVector(y_vec); + PopulateVector(a_mat); + PopulateVector(b_mat); + PopulateVector(c_mat); + PopulateVector(temp); + + // Initializes the tuner for the chosen device + cltune::Tuner tuner(args.platform_id, args.device_id); + + // Use full-search to explore all parameter combinations or random-search to search only a part of + // the parameter values. The fraction is set as a command-line argument. + if (args.fraction == 1.0 || args.fraction == 0.0) { + tuner.UseFullSearch(); + } + else { + tuner.UseRandomSearch(1.0/args.fraction); + } + + // Set extra settings for specific defines. This mimics src/routine.cc. + auto defines = std::string{""}; + if (isAMD && isGPU) { + defines += "#define USE_CL_MAD 1\n"; + defines += "#define USE_STAGGERED_INDICES 1\n"; + } + if (isARM && isGPU) { + defines += "#define GLOBAL_MEM_FENCE 1\n"; + } + + // Loads the kernel sources and defines the kernel to tune + auto sources = defines + C::GetSources(); + auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); + tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef()); + + // Sets the tunable parameters and their possible values + C::SetParameters(tuner, id); + C::SetConstraints(tuner, id); + C::SetLocalMemorySize(tuner, id, args); + + // Tests for a specific precision + tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); + tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); + + // Modifies the thread-sizes (both global and local) based on the parameters + for (auto ¶meters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); } + for (auto ¶meters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); } + for (auto ¶meters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); } + for (auto ¶meters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); } + + // Sets the function's arguments + C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp); + + // Starts the tuning process + tuner.Tune(); + + // Prints the results to screen + auto time_ms = tuner.PrintToScreen(); + tuner.PrintFormatted(); + + // Also prints the performance of the best-case in terms of GB/s or GFLOPS + if (time_ms != 0.0) { + printf("[ -------> ] %.1lf ms", time_ms); + printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str()); + } + + // Outputs the results as JSON to disk, including some meta-data + auto precision_string = std::to_string(static_cast<size_t>(args.precision)); + auto metadata = std::vector<std::pair<std::string,std::string>>{ + {"kernel_family", C::KernelFamily()}, + {"precision", precision_string} + }; + for (auto &o: C::GetOptions()) { + if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); } + if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); } + if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); } + if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); } + if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } + } + tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata); +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TUNING_H_ +#endif |