summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2015-08-20 08:38:18 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2015-08-20 08:38:18 +0200
commitcf168fca7088fb050a10b2e82789c6cd9356716b (patch)
tree0f6a96976c6f74f816bfc9d53dc7c7e323e7d43d /include
parent85bd783e0d80de9aec0a4bcd49ab4acd88ab07d0 (diff)
parent15db2bcc208d8e5bccf0464396431c7d8e6f3f28 (diff)
Merge pull request #23 from CNugteren/tuner_database
Added initial version of a tuner-database
Diffstat (limited to 'include')
-rw-r--r--include/internal/tuning.h146
-rw-r--r--include/internal/utilities.h6
2 files changed, 117 insertions, 35 deletions
diff --git a/include/internal/tuning.h b/include/internal/tuning.h
index d0cf6b5d..f029c704 100644
--- a/include/internal/tuning.h
+++ b/include/internal/tuning.h
@@ -7,9 +7,8 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This file implements the header for the tuner functions. This is only used for the optional
-// and stand-alone tuner binaries and not part of the core of CLBlast. The convention used here is
-// that X and Y are vectors, while A, B, and C are matrices.
+// This file implements the interface to the CLTune auto-tuner. This is only used for the optional
+// and stand-alone tuner binaries and not part of the core of CLBlast.
//
// =================================================================================================
@@ -17,44 +16,121 @@
#define CLBLAST_TUNING_H_
#include <vector>
-#include <functional>
+#include <string>
#include <cltune.h>
namespace clblast {
// =================================================================================================
-// Functions with two or three OpenCL memory buffers
-template <typename T>
-using Tuner2 = std::function<void(const Arguments<T>&,
- const std::vector<T>&, std::vector<T>&,
- cltune::Tuner&)>;
-template <typename T>
-using Tuner3 = std::function<void(const Arguments<T>&,
- const std::vector<T>&, const std::vector<T>&, std::vector<T>&,
- cltune::Tuner&)>;
-
-// As above, but now with an additional ID for the variation
-template <typename T>
-using Tuner3V = std::function<void(const Arguments<T>&, const size_t,
- const std::vector<T>&, const std::vector<T>&, std::vector<T>&,
- cltune::Tuner&)>;
-
-// Tuner for vector-vector input
-template <typename T>
-void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function);
-
-// Tuner for matrix-vector-vector input
-template <typename T>
-void TunerAXY(int argc, char* argv[], const size_t num_variations, const Tuner3V<T> &tune_function);
-
-// Tuner for matrix-matrix input
-template <typename T>
-void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function);
-
-// Tuner for matrix-matrix-matrix input
-template <typename T>
-void TunerABC(int argc, char* argv[], const Tuner3<T> &tune_function);
+// Parses the command-line arguments, sets up the input buffers, configures the tuner, and collects
+// the results. Used for all types of kernel families. This is a header-only function so that it is
+// automatically compiled per kernel (the 'C' template argument) and per buffer data type ('T').
+template <typename C, typename T>
+void Tuner(int argc, char* argv[]) {
+
+ // Sets the platform/device and the kernel-specific options listed by 'C' (command-line options)
+ auto help = std::string{"* Options given/available:\n"};
+ auto args = Arguments<T>{};
+ args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+ args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+ args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+ for (auto &o: C::GetOptions()) {
+ if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, C::DefaultM()); }
+ if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, C::DefaultN()); }
+ if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, C::DefaultK()); }
+ if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); }
+ if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
+ if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
+ }
+ fprintf(stdout, "%s\n", help.c_str());
+
+ // Tests the validity of the given arguments (delegated to the kernel class 'C')
+ C::TestValidArguments(args);
+
+ // Tests whether the device supports the precision of type 'T'; if not, skips this tuning run
+ {
+ auto platform = Platform(args.platform_id);
+ auto device = Device(platform, args.device_id);
+ if (!PrecisionSupported<T>(device)) {
+ printf("* Unsupported precision, skipping this tuning run\n\n");
+ return;
+ }
+ }
+
+ // Creates input buffers with random data; the buffer sizes are given by the kernel class 'C'
+ auto x_vec = std::vector<T>(C::GetSizeX(args));
+ auto y_vec = std::vector<T>(C::GetSizeY(args));
+ auto a_mat = std::vector<T>(C::GetSizeA(args));
+ auto b_mat = std::vector<T>(C::GetSizeB(args));
+ auto c_mat = std::vector<T>(C::GetSizeC(args));
+ PopulateVector(x_vec);
+ PopulateVector(y_vec);
+ PopulateVector(a_mat);
+ PopulateVector(b_mat);
+ PopulateVector(c_mat);
+
+ // Initializes the tuner for the chosen device
+ cltune::Tuner tuner(args.platform_id, args.device_id);
+
+ // Uses full-search to explore all parameter combinations or random-search to search only a part
+ // of the parameter values. A fraction of exactly 0.0 or 1.0 selects full-search.
+ if (args.fraction == 1.0 || args.fraction == 0.0) {
+ tuner.UseFullSearch();
+ }
+ else {
+ tuner.UseRandomSearch(1.0/args.fraction);
+ }
+
+ // Loads the kernel sources and defines both the kernel to tune and the reference kernel
+ auto sources = C::GetSources();
+ auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize());
+ tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSizeRef());
+
+ // Sets the tunable parameters and their possible values
+ C::SetParameters(tuner, id);
+ C::SetConstraints(tuner, id);
+ C::SetLocalMemorySize(tuner, id, args);
+
+ // Passes the precision as a single-valued "PRECISION" parameter to both kernel and reference
+ tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+ tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+ // Modifies the thread-sizes (both global and local) based on the parameters
+ for (auto &parameters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); }
+ for (auto &parameters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); }
+ for (auto &parameters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); }
+ for (auto &parameters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); }
+
+ // Sets the function's arguments
+ C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat);
+
+ // Starts the tuning process
+ tuner.Tune();
+
+ // Prints the results to screen and keeps the time (in ms) returned by PrintToScreen
+ auto time_ms = tuner.PrintToScreen();
+ tuner.PrintFormatted();
+
+ // Also prints the performance of the best-case in GB/s or GFLOPS (skipped if the time is zero)
+ if (time_ms != 0.0) {
+ printf("[ -------> ] %.1lf ms", time_ms);
+ printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
+ }
+
+ // Outputs the results as JSON to "clblast_<family>_<precision>.json", including some meta-data
+ auto precision_string = std::to_string(static_cast<size_t>(args.precision));
+ auto metadata = std::vector<std::pair<std::string,std::string>>{
+ {"kernel_family", C::KernelFamily()},
+ {"precision", precision_string}
+ };
+ for (auto &o: C::GetOptions()) {
+ if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); }
+ if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); }
+ if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); }
+ }
+ tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata);
+}
// =================================================================================================
} // namespace clblast
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
index 6dba24e1..d9fdb9ab 100644
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@@ -198,6 +198,12 @@ bool IsMultiple(const size_t a, const size_t b);
size_t GetBytes(const Precision precision);
// =================================================================================================
+
+// Returns false if this precision is not supported by the device
+template <typename T>
+bool PrecisionSupported(const Device &device);
+
+// =================================================================================================
} // namespace clblast
// CLBLAST_UTILITIES_H_