+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+// Author(s):
+// Cedric Nugteren <>
+// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune).
+// This is only used for the optional tuner binaries and not part of the core of CLBlast.
+// =================================================================================================
+#include <vector>
+#include <string>
+#include <random>
+#include <utility>
+#include <algorithm>
+#include <cstdio>
+#include "tuning/tuning.hpp"
+#include "tuning/kernels/copy_fast.hpp"
+namespace clblast {
+// =================================================================================================
+// Public entry point that tunes using the tuner callbacks in scope here (GetTunerDefaults,
+// GetTunerSettings<T>, TestValidArguments<T>, SetConstraints, SetArguments<T>) -- presumably the
+// ones declared in tuning/kernels/copy_fast.hpp; confirm against that header.
+//   queue      - raw command queue; it is dereferenced, so it must not be null
+//   m, n       - problem sizes stored in the tuner arguments
+//   fraction   - part of the search space to explore, forwarded through the tuner arguments
+//   parameters - output map that TunerAPI fills with the best parameter values
+// Returns the StatusCode produced by TunerAPI unchanged.
+template <typename T>
+StatusCode TuneCopyMatrixFast(RawCommandQueue * queue, const size_t m, const size_t n,
+                              const double fraction,
+                              std::unordered_map<std::string,size_t> &parameters) {
+  // Packs the user-supplied values into the argument structure expected by the tuner
+  auto args = Arguments<T>();
+  args.m = m;
+  args.n = n;
+  args.fraction = fraction;
+  // Wraps the raw queue in the C++ queue class; the kernel variant V is fixed to 0
+  auto queue_cpp = Queue(*queue);
+  return TunerAPI<T>(queue_cpp, args, 0, GetTunerDefaults, GetTunerSettings<T>,
+                     TestValidArguments<T>, SetConstraints, SetArguments<T>, parameters);
+}
+// Explicit instantiations of TuneCopyMatrixFast for half, float, double, float2 and double2
+template StatusCode TuneCopyMatrixFast<half>(RawCommandQueue*, const size_t, const size_t,
+                                             const double,
+                                             std::unordered_map<std::string,size_t>&);
+template StatusCode TuneCopyMatrixFast<float>(RawCommandQueue*, const size_t, const size_t,
+                                              const double,
+                                              std::unordered_map<std::string,size_t>&);
+template StatusCode TuneCopyMatrixFast<double>(RawCommandQueue*, const size_t, const size_t,
+                                               const double,
+                                               std::unordered_map<std::string,size_t>&);
+template StatusCode TuneCopyMatrixFast<float2>(RawCommandQueue*, const size_t, const size_t,
+                                               const double,
+                                               std::unordered_map<std::string,size_t>&);
+template StatusCode TuneCopyMatrixFast<double2>(RawCommandQueue*, const size_t, const size_t,
+                                                const double,
+                                                std::unordered_map<std::string,size_t>&);
+// =================================================================================================
+// The main tuner API, similar to the one in tuning.cpp, but without I/O.
+//
+// Steps performed for kernel variant V on the device behind 'queue':
+//   1. retrieves the tuner defaults and settings and runs TestValidArguments,
+//   2. returns early when the device does not support the precision of T,
+//   3. creates host buffers (populated through PopulateVector) and matching device buffers,
+//   4. builds the list of configurations, optionally reduced to a random fraction,
+//   5. runs the reference kernel once and keeps its outputs,
+//   6. compiles, times and verifies every configuration against the reference outputs,
+//   7. writes the parameters of the fastest accepted configuration into 'parameters'.
+//
+// Returns:
+//   kSuccess             - a best configuration was found and stored in 'parameters'
+//   kNoDoublePrecision   - T is (complex) double and the device lacks support for it
+//   kNoHalfPrecision     - T is half and the device lacks support for it
+//   kUnexpectedError     - no configuration yielded a result, or the best time equals zero
+//   (other)              - whatever DispatchExceptionCatchAll reports if the reference run throws
+template <typename T>
+StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
+                    const GetTunerDefaultsFunc GetTunerDefaults,
+                    const GetTunerSettingsFunc<T> GetTunerSettings,
+                    const TestValidArgumentsFunc<T> TestValidArguments,
+                    const SetConstraintsFunc SetConstraints,
+                    const SetArgumentsFunc<T> SetArguments,
+                    std::unordered_map<std::string,size_t> &parameters) {
+  // Sets the parameters and platform/device for which to tune
+  // NOTE(review): 'defaults' is not read again in this function
+  const TunerDefaults defaults = GetTunerDefaults(V);
+  const TunerSettings settings = GetTunerSettings(V, args);
+  // Tests validity of the given arguments
+  TestValidArguments(V, args);
+  // Retrieves OpenCL classes
+  const auto device = queue.GetDevice();
+  const auto context = queue.GetContext();
+  // Inspects whether or not FP64 is supported in case of double precision
+  if ((PrecisionValue<T>() == Precision::kDouble && !PrecisionSupported<double>(device)) ||
+      (PrecisionValue<T>() == Precision::kComplexDouble && !PrecisionSupported<double2>(device))) {
+    return StatusCode::kNoDoublePrecision;
+  }
+  // As above, but for FP16 (half precision)
+  if (PrecisionValue<T>() == Precision::kHalf && !PrecisionSupported<half>(device)) {
+    return StatusCode::kNoHalfPrecision;
+  }
+  // Retrieves properties
+  // NOTE(review): these four values are not read again in this function
+  const auto device_type = GetDeviceType(device);
+  const auto device_vendor = GetDeviceVendor(device);
+  const auto device_architecture = GetDeviceArchitecture(device);
+  const auto device_name = GetDeviceName(device);
+  // Creates input buffers with random data. The position of a size in this list is the buffer
+  // id used by settings.inputs and settings.outputs further down.
+  const auto buffer_sizes = std::vector<size_t>{
+    settings.size_x, settings.size_y,
+    settings.size_a, settings.size_b, settings.size_c,
+    settings.size_temp
+  };
+  const auto seed = static_cast<unsigned long>(time(nullptr));
+  std::mt19937 mt(seed);
+  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
+  auto source_buffers = std::vector<std::vector<T>>();
+  auto reference_buffers = std::vector<std::vector<T>>();
+  auto result_buffers = std::vector<std::vector<T>>();
+  auto device_buffers = std::vector<Buffer<T>>();
+  for (const auto size : buffer_sizes) {
+    auto host_buffer = std::vector<T>(size);
+    PopulateVector(host_buffer, mt, dist);
+    source_buffers.push_back(host_buffer);
+    reference_buffers.push_back(std::vector<T>(size));
+    result_buffers.push_back(std::vector<T>(size));
+    device_buffers.push_back(Buffer<T>(context, size));
+  }
+  // Sets the tunable parameters and their possible values
+  auto configurations = SetConfigurations(settings.parameters, SetConstraints(V));
+  // Select the search method (full search or a random fraction). Only a fraction strictly
+  // between 0 and 1 sub-samples the list: a value above 1 would make resize() grow the list with
+  // default-constructed configurations, and a negative or NaN value cannot be converted to a
+  // size. Every other value results in a full search.
+  if (args.fraction > 0.0 && args.fraction < 1.0) {
+    const auto new_size = static_cast<size_t>(configurations.size() * args.fraction);
+    // The shuffle engine is default-constructed, so the selected subset is the same on every run
+    auto rng = std::default_random_engine{};
+    std::shuffle(std::begin(configurations), std::end(configurations), rng);
+    configurations.resize(new_size);
+  }
+  // First runs a reference example to compare against
+  try {
+    // Sets the input
+    for (const auto id : settings.inputs) {
+      device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
+    }
+    // Compiles the kernel
+    auto compiler_options = std::vector<std::string>();
+    const auto program = CompileFromSource(settings.sources, args.precision,
+                                           settings.kernel_name, device, context,
+                                           compiler_options, 0);
+    auto kernel = Kernel(program, settings.kernel_name);
+    SetArguments(V, kernel, args, device_buffers);
+    // Runs the kernel; a time of -1.0 is treated as a failed run
+    const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device,
+                                    settings.global_size_ref, settings.local_size_ref, true);
+    if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); }
+    // Saves the result
+    for (const auto id : settings.outputs) {
+      device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]);
+    }
+  }
+  catch (...) {
+    // Without a reference there is nothing to verify against: report the failure to the caller
+    return DispatchExceptionCatchAll(true);
+  }
+  // Starts the tuning process
+  auto results = std::vector<TuningResult>();
+  for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) {
+    try {
+      auto configuration = configurations[config_id];
+      // Sets the input
+      for (const auto id : settings.inputs) {
+        device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
+      }
+      // Sets the thread configuration
+      const auto global = SetThreadConfiguration(configuration, settings.global_size,
+                                                 settings.mul_global, settings.div_global);
+      const auto local = SetThreadConfiguration(configuration, settings.local_size,
+                                                settings.mul_local, settings.div_local);
+      // Sets the parameters for this configuration: one '#define' per parameter is prepended to
+      // the kernel source
+      auto kernel_source = std::string{""};
+      for (const auto &parameter : configuration) {
+        kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n";
+      }
+      kernel_source += settings.sources;
+      // Compiles the kernel
+      auto compiler_options = std::vector<std::string>();
+      const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name,
+                                             device, context, compiler_options, 0, true);
+      auto kernel = Kernel(program, settings.kernel_name);
+      // Runs the kernel
+      SetArguments(V, kernel, args, device_buffers);
+      const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local, true);
+      // Kernel run was not successful
+      if (time_ms == -1.0) {
+        continue;
+      }
+      // Compares the results
+      // NOTE(review): l2_error is not reset between outputs, so the normalised error of an
+      // earlier output is carried into the sum of the next one -- confirm this is intended
+      auto l2_error = 0.0;
+      for (const auto id : settings.outputs) {
+        device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]);
+        for (auto index = size_t{0}; index<buffer_sizes[id]; ++index) {
+          const auto diff = SquaredDifference(result_buffers[id][index],
+                                              reference_buffers[id][index]);
+          l2_error += diff;
+        }
+        l2_error /= static_cast<double>(buffer_sizes[id]);
+        if (std::isnan(l2_error) || l2_error > 1.0e-4) {
+          throw std::runtime_error("L2 error too large");
+        }
+      }
+      results.push_back(TuningResult{settings.kernel_name, time_ms, configuration});
+    }
+    catch (...) {
+      // Deliberately ignored: a configuration that fails to compile, run or verify is simply
+      // left out of the results and the search continues with the next one
+    }
+  }
+  // Completed the tuning process
+  if (results.empty()) { return StatusCode::kUnexpectedError; }
+  // Computes the best results: the lowest score (time in milliseconds) wins
+  auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) {
+    return lhs.score < rhs.score;
+  };
+  const auto best_configuration = std::min_element(results.begin(), results.end(), comparison);
+  const auto best_time_ms = best_configuration->score;
+  if (best_time_ms == 0.0) { return StatusCode::kUnexpectedError; }
+  // Stores the best parameters; existing entries with the same name are overwritten
+  for (const auto &config : best_configuration->config) {
+    parameters[config.first] = config.second;
+  }
+  return StatusCode::kSuccess;
+}
+// Explicit instantiations of TunerAPI for half, float, double, float2 and double2
+template StatusCode TunerAPI<half>(Queue&, const Arguments<half>&, const int,
+                                   const GetTunerDefaultsFunc,
+                                   const GetTunerSettingsFunc<half>,
+                                   const TestValidArgumentsFunc<half>,
+                                   const SetConstraintsFunc,
+                                   const SetArgumentsFunc<half>,
+                                   std::unordered_map<std::string,size_t>&);
+template StatusCode TunerAPI<float>(Queue&, const Arguments<float>&, const int,
+                                    const GetTunerDefaultsFunc,
+                                    const GetTunerSettingsFunc<float>,
+                                    const TestValidArgumentsFunc<float>,
+                                    const SetConstraintsFunc,
+                                    const SetArgumentsFunc<float>,
+                                    std::unordered_map<std::string,size_t>&);
+template StatusCode TunerAPI<double>(Queue&, const Arguments<double>&, const int,
+                                     const GetTunerDefaultsFunc,
+                                     const GetTunerSettingsFunc<double>,
+                                     const TestValidArgumentsFunc<double>,
+                                     const SetConstraintsFunc,
+                                     const SetArgumentsFunc<double>,
+                                     std::unordered_map<std::string,size_t>&);
+template StatusCode TunerAPI<float2>(Queue&, const Arguments<float2>&, const int,
+                                     const GetTunerDefaultsFunc,
+                                     const GetTunerSettingsFunc<float2>,
+                                     const TestValidArgumentsFunc<float2>,
+                                     const SetConstraintsFunc,
+                                     const SetArgumentsFunc<float2>,
+                                     std::unordered_map<std::string,size_t>&);
+template StatusCode TunerAPI<double2>(Queue&, const Arguments<double2>&, const int,
+                                      const GetTunerDefaultsFunc,
+                                      const GetTunerSettingsFunc<double2>,
+                                      const TestValidArgumentsFunc<double2>,
+                                      const SetConstraintsFunc,
+                                      const SetArgumentsFunc<double2>,
+                                      std::unordered_map<std::string,size_t>&);
+// =================================================================================================
+} // namespace clblast