summary refs log tree commit diff
path: root/src/tuning/tuning.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/tuning/tuning.cc')
-rw-r--r-- src/tuning/tuning.cc 186
1 files changed, 186 insertions, 0 deletions
diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc
new file mode 100644
index 00000000..bb93c053
--- /dev/null
+++ b/src/tuning/tuning.cc
@@ -0,0 +1,186 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the common auto-tuning code to interface with the CLTune library.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Tuning driver for vector-vector routines. Parses the command-line options, creates two input
+// vectors of 'n' elements, lets the kernel-specific 'tune_function' configure a CLTune tuner, runs
+// the tuner with a full search, and prints the results including the best-case GB/s.
+template <typename T>
+void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function) {
+
+  // Command-line options (platform/device/precision and the problem size). Each GetArgument call
+  // also receives the help string, which is printed once all options have been processed.
+  std::string help("* Options given/available:\n");
+  Arguments<T> args{};
+  args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+  args.device_id   = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+  args.precision   = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+  args.n           = GetArgument(argc, argv, help, kArgN, size_t{4096*1024});
+  args.alpha       = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>());
+  fprintf(stdout, "%s\n", help.c_str());
+
+  // Input buffers of 'n' elements each, filled with random data
+  std::vector<T> x_data(args.n);
+  std::vector<T> y_data(args.n);
+  PopulateVector(x_data);
+  PopulateVector(y_data);
+
+  // The tuner for the selected platform and device; all parameter combinations are explored
+  cltune::Tuner tuner(args.platform_id, args.device_id);
+  tuner.UseFullSearch();
+
+  // Kernel-specific configuration of the tuning parameters, followed by the actual tuning run
+  tune_function(args, x_data, y_data, tuner);
+  tuner.Tune();
+
+  // Results on screen
+  const auto best_time_ms = tuner.PrintToScreen();
+  tuner.PrintFormatted();
+
+  // Best-case performance: 3*n elements expressed in mega-bytes, divided by the time in
+  // milliseconds, which yields GB/s. Nothing is printed when the time is zero, which also avoids
+  // a division by zero.
+  const auto total_bytes = 3*args.n*GetBytes(args.precision);
+  const auto mega_bytes = total_bytes * 1.0e-6;
+  if (best_time_ms == 0.0) { return; }
+  printf("[ -------> ] %.1lf ms or %.1lf GB/s\n", best_time_ms, mega_bytes/best_time_ms);
+}
+
+// Compiles the above function: explicit template instantiations of TunerXY for each of the data
+// types float, double, float2, and double2, so that the definition in this file is emitted for them.
+template void TunerXY<float>(int, char**, const Tuner2<float>&);
+template void TunerXY<double>(int, char**, const Tuner2<double>&);
+template void TunerXY<float2>(int, char**, const Tuner2<float2>&);
+template void TunerXY<double2>(int, char**, const Tuner2<double2>&);
+
+// =================================================================================================
+
+// Tuning driver for matrix-matrix routines. Parses the command-line options, creates two input
+// matrices of m*n elements, lets the kernel-specific 'tune_function' configure a CLTune tuner,
+// runs the tuner with a full search, and prints the results including the best-case GB/s.
+template <typename T>
+void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function) {
+
+  // Command-line options (platform/device/precision and the problem sizes). Each GetArgument call
+  // also receives the help string, which is printed once all options have been processed.
+  std::string help("* Options given/available:\n");
+  Arguments<T> args{};
+  args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+  args.device_id   = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+  args.precision   = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+  args.m           = GetArgument(argc, argv, help, kArgM, size_t{1024});
+  args.n           = GetArgument(argc, argv, help, kArgN, size_t{1024});
+  args.fraction    = GetArgument(argc, argv, help, kArgFraction, 2048.0);
+  fprintf(stdout, "%s\n", help.c_str());
+
+  // Input buffers of m*n elements each, filled with random data
+  std::vector<T> a_data(args.m * args.n);
+  std::vector<T> b_data(args.m * args.n);
+  PopulateVector(a_data);
+  PopulateVector(b_data);
+
+  // The tuner for the selected platform and device; all parameter combinations are explored
+  cltune::Tuner tuner(args.platform_id, args.device_id);
+  tuner.UseFullSearch();
+
+  // Kernel-specific configuration of the tuning parameters, followed by the actual tuning run
+  tune_function(args, a_data, b_data, tuner);
+  tuner.Tune();
+
+  // Results on screen
+  const auto best_time_ms = tuner.PrintToScreen();
+  tuner.PrintFormatted();
+
+  // Best-case performance: 2*m*n elements expressed in mega-bytes, divided by the time in
+  // milliseconds, which yields GB/s. Nothing is printed when the time is zero, which also avoids
+  // a division by zero.
+  const auto total_bytes = 2*args.m*args.n*GetBytes(args.precision);
+  const auto mega_bytes = total_bytes * 1.0e-6;
+  if (best_time_ms == 0.0) { return; }
+  printf("[ -------> ] %.1lf ms or %.1lf GB/s\n", best_time_ms, mega_bytes/best_time_ms);
+}
+
+// Compiles the above function: explicit template instantiations of TunerAB for each of the data
+// types float, double, float2, and double2, so that the definition in this file is emitted for them.
+template void TunerAB<float>(int, char**, const Tuner2<float>&);
+template void TunerAB<double>(int, char**, const Tuner2<double>&);
+template void TunerAB<float2>(int, char**, const Tuner2<float2>&);
+template void TunerAB<double2>(int, char**, const Tuner2<double2>&);
+
+// =================================================================================================
+
+// Tuning driver for matrix-matrix-matrix routines. Parses the command-line options, creates the
+// three input matrices (m*k, n*k, and m*n elements), lets the kernel-specific 'tune_function'
+// configure a CLTune tuner, runs the tuner with a random search over a fraction of the search
+// space, and prints the results including the best-case GFLOPS.
+template <typename T>
+void TunerABC(int argc, char* argv[], const Tuner3<T> &tune_function) {
+
+  // Command-line options (platform/device/precision, problem sizes, scalars, and the search
+  // fraction). Each GetArgument call also receives the help string, which is printed once all
+  // options have been processed.
+  std::string help("* Options given/available:\n");
+  Arguments<T> args{};
+  args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+  args.device_id   = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+  args.precision   = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+  args.m           = GetArgument(argc, argv, help, kArgM, size_t{1024});
+  args.n           = GetArgument(argc, argv, help, kArgN, size_t{1024});
+  args.k           = GetArgument(argc, argv, help, kArgK, size_t{1024});
+  args.alpha       = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>());
+  args.beta        = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>());
+  args.fraction    = GetArgument(argc, argv, help, kArgFraction, 2048.0);
+  fprintf(stdout, "%s\n", help.c_str());
+
+  // Input buffers filled with random data: A holds m*k, B holds n*k, and C holds m*n elements
+  std::vector<T> a_data(args.m * args.k);
+  std::vector<T> b_data(args.n * args.k);
+  std::vector<T> c_data(args.m * args.n);
+  PopulateVector(a_data);
+  PopulateVector(b_data);
+  PopulateVector(c_data);
+
+  // The tuner for the selected platform and device. Only a part of the parameter values is
+  // searched: the random search receives the reciprocal of the 'fraction' command-line argument.
+  cltune::Tuner tuner(args.platform_id, args.device_id);
+  const auto search_fraction = 1.0/args.fraction;
+  tuner.UseRandomSearch(search_fraction);
+
+  // Kernel-specific configuration of the tuning parameters, followed by the actual tuning run
+  tune_function(args, a_data, b_data, c_data, tuner);
+  tuner.Tune();
+
+  // Results on screen
+  const auto best_time_ms = tuner.PrintToScreen();
+  tuner.PrintFormatted();
+
+  // Best-case performance: 2*m*n*k operations expressed in mega-flops, divided by the time in
+  // milliseconds, which yields GFLOPS. Nothing is printed when the time is zero, which also avoids
+  // a division by zero.
+  const auto total_flops = 2*args.m*args.n*args.k;
+  const auto mega_flops = total_flops * 1.0e-6;
+  if (best_time_ms == 0.0) { return; }
+  printf("[ -------> ] %.1lf ms or %.1lf GFLOPS\n", best_time_ms, mega_flops/best_time_ms);
+}
+
+// Compiles the above function: explicit template instantiations of TunerABC for each of the data
+// types float, double, float2, and double2, so that the definition in this file is emitted for them.
+template void TunerABC<float>(int, char**, const Tuner3<float>&);
+template void TunerABC<double>(int, char**, const Tuner3<double>&);
+template void TunerABC<float2>(int, char**, const Tuner3<float2>&);
+template void TunerABC<double2>(int, char**, const Tuner3<double2>&);
+
+// =================================================================================================
+} // namespace clblast