diff options
Diffstat (limited to 'src/tuning/tuning_api.cpp')
-rw-r--r-- | src/tuning/tuning_api.cpp | 232 |
1 files changed, 232 insertions, 0 deletions
diff --git a/src/tuning/tuning_api.cpp b/src/tuning/tuning_api.cpp new file mode 100644 index 00000000..94a9a367 --- /dev/null +++ b/src/tuning/tuning_api.cpp @@ -0,0 +1,232 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune). +// This is only used for the optional tuner binaries and not part of the core of CLBlast. +// +// ================================================================================================= + +#include <vector> +#include <string> +#include <random> +#include <utility> +#include <algorithm> +#include <cstdio> + +#include "tuning/tuning.hpp" +#include "tuning/kernels/copy_fast.hpp" + +namespace clblast { +// ================================================================================================= + +template <typename T> +StatusCode TuneCopyMatrixFast(RawCommandQueue * queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string,size_t> ¶meters) { + auto args = Arguments<T>(); + args.m = m; + args.n = n; + args.fraction = fraction; + auto queue_cpp = Queue(*queue); + return TunerAPI<T>(queue_cpp, args, 0, GetTunerDefaults, GetTunerSettings<T>, + TestValidArguments<T>, SetConstraints, SetArguments<T>, parameters); +} + +// Compiles the above +template StatusCode TuneCopyMatrixFast<half>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&); +template StatusCode TuneCopyMatrixFast<float>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&); +template StatusCode TuneCopyMatrixFast<double>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&); +template StatusCode TuneCopyMatrixFast<float2>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&); +template StatusCode TuneCopyMatrixFast<double2>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&); + +// ================================================================================================= + +// The main tuner API, similar to the one in tuning.cpp, but without I/O +template <typename T> +StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V, + const GetTunerDefaultsFunc GetTunerDefaults, + const GetTunerSettingsFunc<T> GetTunerSettings, + const TestValidArgumentsFunc<T> TestValidArguments, + const SetConstraintsFunc SetConstraints, + const SetArgumentsFunc<T> SetArguments, + std::unordered_map<std::string,size_t> ¶meters) { + + // Sets the parameters and platform/device for which to tune (command-line options) + const TunerDefaults defaults = GetTunerDefaults(V); + const TunerSettings settings = GetTunerSettings(V, args); + + // Tests validity of the given arguments + TestValidArguments(V, args); + + // Retrieves OpenCL classes + const auto device = queue.GetDevice(); + const auto context = queue.GetContext(); + + // Inspects whether or not FP64 is supported in case of double precision + if ((PrecisionValue<T>() == Precision::kDouble && !PrecisionSupported<double>(device)) || + (PrecisionValue<T>() == Precision::kComplexDouble && !PrecisionSupported<double2>(device))) { + return StatusCode::kNoDoublePrecision; + } + + // As above, but for FP16 (half precision) + if (PrecisionValue<T>() == Precision::kHalf && !PrecisionSupported<half>(device)) { + return StatusCode::kNoHalfPrecision; + } + + // Retrieves properties + const auto device_type = GetDeviceType(device); + const auto device_vendor = GetDeviceVendor(device); + const auto device_architecture = GetDeviceArchitecture(device); + const auto device_name = GetDeviceName(device); + + // Creates input buffers with random data + const auto buffer_sizes = std::vector<size_t>{ + settings.size_x, settings.size_y, + settings.size_a, settings.size_b, settings.size_c, + settings.size_temp + }; + const auto seed = static_cast<unsigned long>(time(nullptr)); + std::mt19937 mt(seed); + std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); + auto source_buffers = std::vector<std::vector<T>>(); + auto reference_buffers = std::vector<std::vector<T>>(); + auto result_buffers = std::vector<std::vector<T>>(); + auto device_buffers = std::vector<Buffer<T>>(); + for (const auto size : buffer_sizes) { + auto host_buffer = std::vector<T>(size); + PopulateVector(host_buffer, mt, dist); + source_buffers.push_back(host_buffer); + reference_buffers.push_back(std::vector<T>(size)); + result_buffers.push_back(std::vector<T>(size)); + device_buffers.push_back(Buffer<T>(context, size)); + } + + // Sets the tunable parameters and their possible values + auto configurations = SetConfigurations(settings.parameters, SetConstraints(V)); + + // Select the search method (full search or a random fraction) + if (args.fraction != 0.0 && args.fraction != 1.0) { + const auto new_size = static_cast<size_t>(configurations.size() * args.fraction); + auto rng = std::default_random_engine{}; + std::shuffle(std::begin(configurations), std::end(configurations), rng); + configurations.resize(new_size); + } + + // First runs a reference example to compare against + try { + + // Sets the input + for (const auto id : settings.inputs) { + device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); + } + + // Compiles the kernel + auto compiler_options = std::vector<std::string>(); + const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name, + device, context, compiler_options, 0); + auto kernel = Kernel(program, settings.kernel_name); + SetArguments(V, kernel, args, device_buffers); + + // Runs the kernel + const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, + settings.global_size_ref, settings.local_size_ref, true); + if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); } + + // Saves the result + for (const auto id : settings.outputs) { + device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]); + } + } + catch (...) { + const auto status_code = DispatchExceptionCatchAll(true); + return status_code; + } + + // Starts the tuning process + auto results = std::vector<TuningResult>(); + for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) { + try { + auto configuration = configurations[config_id]; + + // Sets the input + for (const auto id : settings.inputs) { + device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); + } + + // Sets the thread configuration + const auto global = SetThreadConfiguration(configuration, settings.global_size, + settings.mul_global, settings.div_global); + const auto local = SetThreadConfiguration(configuration, settings.local_size, + settings.mul_local, settings.div_local); + + // Sets the parameters for this configuration + auto kernel_source = std::string{""}; + for (const auto ¶meter : configuration) { + kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n"; + } + kernel_source += settings.sources; + + // Compiles the kernel + auto compiler_options = std::vector<std::string>(); + const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name, + device, context, compiler_options, 0, true); + auto kernel = Kernel(program, settings.kernel_name); + + // Runs the kernel + SetArguments(V, kernel, args, device_buffers); + const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local, true); + + // Kernel run was not successful + if (time_ms == -1.0) { + continue; + } + + // Compares the results + auto l2_error = 0.0; + for (const auto id : settings.outputs) { + device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]); + for (auto index = size_t{0}; index<buffer_sizes[id]; ++index) { + const auto diff = SquaredDifference(result_buffers[id][index], reference_buffers[id][index]); + l2_error += diff; + } + l2_error /= static_cast<double>(buffer_sizes[id]); + if (std::isnan(l2_error) || l2_error > 1.0e-4) { + throw std::runtime_error("L2 error too large"); + } + } + results.push_back(TuningResult{settings.kernel_name, time_ms, configuration}); + } + catch (...) { + } + } + + // Completed the tuning process + if (results.size() == 0) { return StatusCode::kUnexpectedError; } + + // Computes the best results + auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; + const auto best_configuration = std::min_element(results.begin(), results.end(), comparison); + const auto best_time_ms = best_configuration->score; + if (best_time_ms == 0.0) { return StatusCode::kUnexpectedError; } + + // Stores the best parameters + for (const auto config : best_configuration->config) { + parameters[config.first] = config.second; + } + return StatusCode::kSuccess; +} + +// Compiles the above function +template StatusCode TunerAPI<half>(Queue &queue, const Arguments<half> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<half> GetTunerSettings, const TestValidArgumentsFunc<half> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<half> SetArguments, std::unordered_map<std::string,size_t>&); +template StatusCode TunerAPI<float>(Queue &queue, const Arguments<float> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<float> GetTunerSettings, const TestValidArgumentsFunc<float> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<float> SetArguments, std::unordered_map<std::string,size_t>&); +template StatusCode TunerAPI<double>(Queue &queue, const Arguments<double> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<double> GetTunerSettings, const TestValidArgumentsFunc<double> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<double> SetArguments, std::unordered_map<std::string,size_t>&); +template StatusCode TunerAPI<float2>(Queue &queue, const Arguments<float2> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<float2> GetTunerSettings, const TestValidArgumentsFunc<float2> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<float2> SetArguments, std::unordered_map<std::string,size_t>&); +template StatusCode TunerAPI<double2>(Queue &queue, const Arguments<double2> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<double2> GetTunerSettings, const TestValidArgumentsFunc<double2> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<double2> SetArguments, std::unordered_map<std::string,size_t>&); + +// ================================================================================================= +} // namespace clblast |