// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the parameter configurations for the CLBlast auto-tuner (taken from CLTune). // This is only used for the optional tuner binaries and not part of the core of CLBlast. // // ================================================================================================= #include #include #include #include #include #include "tuning/tuning.hpp" #include "tuning/kernels/xaxpy.hpp" #include "tuning/kernels/xdot.hpp" #include "tuning/kernels/xgemv.hpp" #include "tuning/kernels/xger.hpp" #include "tuning/kernels/xgemm.hpp" #include "tuning/kernels/xgemm_direct.hpp" #include "tuning/kernels/copy_fast.hpp" #include "tuning/kernels/copy_pad.hpp" #include "tuning/kernels/transpose_fast.hpp" #include "tuning/kernels/transpose_pad.hpp" #include "tuning/kernels/invert.hpp" namespace clblast { // ================================================================================================= template StatusCode TuneXaxpy(RawCommandQueue * queue, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, XaxpyGetTunerDefaults, XaxpyGetTunerSettings, XaxpyTestValidArguments, XaxpySetConstraints, XaxpyComputeLocalMemSize, XaxpySetArguments, parameters); } template StatusCode PUBLIC_API TuneXaxpy(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXaxpy(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXaxpy(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXaxpy(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXaxpy(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode TuneXdot(RawCommandQueue * queue, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.n = n; auto queue_cpp = Queue(*queue); auto status = TunerAPI(queue_cpp, args, 1, XdotGetTunerDefaults, XdotGetTunerSettings, XdotTestValidArguments, XdotSetConstraints, XdotComputeLocalMemSize, XdotSetArguments, parameters); if (status != StatusCode::kSuccess) { return status; } return TunerAPI(queue_cpp, args, 2, XdotGetTunerDefaults, XdotGetTunerSettings, XdotTestValidArguments, XdotSetConstraints, XdotComputeLocalMemSize, XdotSetArguments, parameters); } template StatusCode PUBLIC_API TuneXdot(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXdot(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXdot(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXdot(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXdot(RawCommandQueue*, const size_t, const double, std::unordered_map&); template StatusCode TuneXgemv(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); auto status = TunerAPI(queue_cpp, args, 1, XgemvGetTunerDefaults, XgemvGetTunerSettings, XgemvTestValidArguments, XgemvSetConstraints, XgemvComputeLocalMemSize, XgemvSetArguments, parameters); if (status != StatusCode::kSuccess) { return status; } status = TunerAPI(queue_cpp, args, 2, XgemvGetTunerDefaults, XgemvGetTunerSettings, XgemvTestValidArguments, XgemvSetConstraints, XgemvComputeLocalMemSize, XgemvSetArguments, parameters); if (status != StatusCode::kSuccess) { return status; } return TunerAPI(queue_cpp, args, 3, XgemvGetTunerDefaults, XgemvGetTunerSettings, XgemvTestValidArguments, XgemvSetConstraints, XgemvComputeLocalMemSize, XgemvSetArguments, parameters); } template StatusCode PUBLIC_API TuneXgemv(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemv(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemv(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemv(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemv(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneXger(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, XgerGetTunerDefaults, XgerGetTunerSettings, XgerTestValidArguments, XgerSetConstraints, XgerComputeLocalMemSize, XgerSetArguments, parameters); } template StatusCode PUBLIC_API TuneXger(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXger(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXger(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXger(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXger(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneXgemm(RawCommandQueue * queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; args.k = k; auto queue_cpp = Queue(*queue); auto status = TunerAPI(queue_cpp, args, 2, XgemmGetTunerDefaults, XgemmGetTunerSettings, XgemmTestValidArguments, XgemmSetConstraints, XgemmComputeLocalMemSize, XgemmSetArguments, parameters); if (status != StatusCode::kSuccess) { return status; } return TunerAPI(queue_cpp, args, 12, XgemmGetTunerDefaults, XgemmGetTunerSettings, XgemmTestValidArguments, XgemmSetConstraints, XgemmComputeLocalMemSize, XgemmSetArguments, parameters); } template StatusCode PUBLIC_API TuneXgemm(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemm(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemm(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemm(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemm(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneXgemmDirect(RawCommandQueue * queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; args.k = k; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 2, XgemmDirectGetTunerDefaults, XgemmDirectGetTunerSettings, XgemmDirectTestValidArguments, XgemmDirectSetConstraints, XgemmDirectComputeLocalMemSize, XgemmDirectSetArguments, parameters); } template StatusCode PUBLIC_API TuneXgemmDirect(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemmDirect(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemmDirect(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemmDirect(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneXgemmDirect(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneCopy(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, CopyGetTunerDefaults, CopyGetTunerSettings, CopyTestValidArguments, CopySetConstraints, CopyComputeLocalMemSize, CopySetArguments, parameters); } template StatusCode PUBLIC_API TuneCopy(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneCopy(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneCopy(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneCopy(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneCopy(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TunePad(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, PadGetTunerDefaults, PadGetTunerSettings, PadTestValidArguments, PadSetConstraints, PadComputeLocalMemSize, PadSetArguments, parameters); } template StatusCode PUBLIC_API TunePad(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePad(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePad(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePad(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePad(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneTranspose(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, TransposeGetTunerDefaults, TransposeGetTunerSettings, TransposeTestValidArguments, TransposeSetConstraints, TransposeComputeLocalMemSize, TransposeSetArguments, parameters); } template StatusCode PUBLIC_API TuneTranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneTranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneTranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneTranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneTranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TunePadtranspose(RawCommandQueue * queue, const size_t m, const size_t n, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, PadtransposeGetTunerDefaults, PadtransposeGetTunerSettings, PadtransposeTestValidArguments, PadtransposeSetConstraints, PadtransposeComputeLocalMemSize, PadtransposeSetArguments, parameters); } template StatusCode PUBLIC_API TunePadtranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePadtranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePadtranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePadtranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TunePadtranspose(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map&); template StatusCode TuneInvert(RawCommandQueue * queue, const size_t m, const size_t n, const size_t k, const double fraction, std::unordered_map ¶meters) { auto args = Arguments(); args.fraction = fraction; args.m = m; args.n = n; args.k = k; auto queue_cpp = Queue(*queue); return TunerAPI(queue_cpp, args, 0, InvertGetTunerDefaults, InvertGetTunerSettings, InvertTestValidArguments, InvertSetConstraints, InvertComputeLocalMemSize, InvertSetArguments, parameters); } template StatusCode PUBLIC_API TuneInvert(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneInvert(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneInvert(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneInvert(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); template StatusCode PUBLIC_API TuneInvert(RawCommandQueue*, const size_t, const size_t, const size_t, const double, std::unordered_map&); // ================================================================================================= // The main tuner API, similar to the one in tuning.cpp, but without I/O template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map ¶meters) { // Sets the parameters and platform/device for which to tune (command-line options) const TunerDefaults defaults = GetTunerDefaults(V); const TunerSettings settings = GetTunerSettings(V, args); // Tests validity of the given arguments TestValidArguments(V, args); // Retrieves OpenCL classes const auto device = queue.GetDevice(); const auto context = queue.GetContext(); // Inspects whether or not FP64 is supported in case of double precision if ((PrecisionValue() == Precision::kDouble && !PrecisionSupported(device)) || (PrecisionValue() == Precision::kComplexDouble && !PrecisionSupported(device))) { return StatusCode::kNoDoublePrecision; } // As above, but for FP16 (half precision) if (PrecisionValue() == Precision::kHalf && !PrecisionSupported(device)) { return StatusCode::kNoHalfPrecision; } // Retrieves properties const auto device_type = GetDeviceType(device); const auto device_vendor = GetDeviceVendor(device); const auto device_architecture = GetDeviceArchitecture(device); const auto device_name = GetDeviceName(device); // Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows. const auto buffer_sizes = std::vector{ settings.size_x + kCanarySize, settings.size_y + kCanarySize, settings.size_a + kCanarySize, settings.size_b + kCanarySize, settings.size_c + kCanarySize, settings.size_temp + kCanarySize }; const auto seed = static_cast(time(nullptr)); std::mt19937 mt(seed); std::uniform_real_distribution dist(kTestDataLowerLimit, kTestDataUpperLimit); auto source_buffers = std::vector>(); auto reference_buffers = std::vector>(); auto result_buffers = std::vector>(); auto device_buffers = std::vector>(); for (const auto size : buffer_sizes) { auto host_buffer = std::vector(size); PopulateVector(host_buffer, mt, dist); source_buffers.push_back(host_buffer); reference_buffers.push_back(std::vector(size)); result_buffers.push_back(std::vector(size)); device_buffers.push_back(Buffer(context, size)); } // Sets the tunable parameters and their possible values auto configurations = SetConfigurations(device, settings.parameters, settings.local_size, settings.mul_local, settings.div_local, SetConstraints(V), ComputeLocalMemSize(V)); // Select the search method (full search or a random fraction) if (args.fraction != 0.0 && args.fraction != 1.0) { const auto new_size = static_cast(configurations.size() * args.fraction); auto rng = std::default_random_engine{}; std::shuffle(std::begin(configurations), std::end(configurations), rng); configurations.resize(new_size); } // First runs a reference example to compare against try { // Sets the input for (const auto id : settings.inputs) { device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); } // Compiles the kernel auto compiler_options = std::vector(); const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name, device, context, compiler_options, 0); auto kernel = Kernel(program, settings.kernel_name); SetArguments(V, kernel, args, device_buffers); // Runs the kernel const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, settings.global_size_ref, settings.local_size_ref, true); if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); } // Saves the result for (const auto id : settings.outputs) { device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]); } } catch (...) { const auto status_code = DispatchExceptionCatchAll(true); return status_code; } // Starts the tuning process auto results = std::vector(); for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) { try { auto configuration = configurations[config_id]; // Sets the input for (const auto id : settings.inputs) { device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); } // Sets the thread configuration const auto global = SetThreadConfiguration(configuration, settings.global_size, settings.mul_global, settings.div_global); const auto local = SetThreadConfiguration(configuration, settings.local_size, settings.mul_local, settings.div_local); // Sets the parameters for this configuration auto kernel_source = std::string{""}; for (const auto ¶meter : configuration) { kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n"; } kernel_source += settings.sources; // Compiles the kernel auto compiler_options = std::vector(); const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name, device, context, compiler_options, 0, true); auto kernel = Kernel(program, settings.kernel_name); // Runs the kernel SetArguments(V, kernel, args, device_buffers); const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local, true); // Kernel run was not successful if (time_ms == -1.0) { continue; } // Compares the results auto l2_error = 0.0; for (const auto id : settings.outputs) { device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]); for (auto index = size_t{0}; index(buffer_sizes[id]); if (std::isnan(l2_error) || l2_error > 1.0e-4) { throw std::runtime_error("L2 error too large"); } } results.push_back(TuningResult{settings.kernel_name, time_ms, configuration}); } catch (...) { } } // Completed the tuning process if (results.size() == 0) { return StatusCode::kUnexpectedError; } // Computes the best results auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; const auto best_configuration = std::min_element(results.begin(), results.end(), comparison); const auto best_time_ms = best_configuration->score; if (best_time_ms == 0.0) { return StatusCode::kUnexpectedError; } // Stores the best parameters for (const auto& config : best_configuration->config) { parameters[config.first] = config.second; } return StatusCode::kSuccess; } // Compiles the above function template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map&); template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map&); template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map&); template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map&); template StatusCode TunerAPI(Queue &queue, const Arguments &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc GetTunerSettings, const TestValidArgumentsFunc TestValidArguments, const SetConstraintsFunc SetConstraints, const ComputeLocalMemSizeFunc ComputeLocalMemSize, const SetArgumentsFunc SetArguments, std::unordered_map&); // ================================================================================================= } // namespace clblast