diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/routines/levelx/xinvert.cpp | 5 | ||||
-rw-r--r-- | src/tuning/routines/xgemm.cpp | 2 | ||||
-rw-r--r-- | src/tuning/routines/xtrsv.cpp | 150 | ||||
-rw-r--r-- | src/utilities/timing.hpp | 2 |
4 files changed, 155 insertions, 4 deletions
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index 5c21d5ce..5fbc5fe5 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -27,8 +27,9 @@ namespace clblast {
 template <typename T>
 Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
     Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
-    #include "../../kernels/level3/level3.opencl"
-    #include "../../kernels/level3/invert_diagonal_blocks.opencl"
+      #include "../../kernels/level3/level3.opencl"
+      , // separated in multiple parts to prevent C1091 in MSVC 2013
+      #include "../../kernels/level3/invert_diagonal_blocks.opencl"
     }) {
 }
 
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index cf750519..83db6104 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -72,7 +72,7 @@ void TuneXgemm(int argc, char* argv[]) {
     return;
   }
   const auto context = Context(device);
-  const auto queue = Queue(context, device);
+  auto queue = Queue(context, device);
 
   // Buffers
   auto buffers = std::vector<Buffer<T>>{
diff --git a/src/tuning/routines/xtrsv.cpp b/src/tuning/routines/xtrsv.cpp
new file mode 100644
index 00000000..9e8f26fa
--- /dev/null
+++ b/src/tuning/routines/xtrsv.cpp
@@ -0,0 +1,150 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file tunes the Xtrsv routine at a high-level: choosing an appropriate block size
+//
+// =================================================================================================
+
+#include <exception>
+#include <string>
+#include <vector>
+#include <limits>
+
+#include "utilities/utilities.hpp"
+#include "tuning/tuning.hpp"
+#include "routines/routines.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+constexpr auto size = size_t{1024}; // 'n' argument
+
+// Sets TRSV_BLOCK_SIZE to 'value' by calling OverrideParameters for "TrsvRoutine" on this device
+// at the precision of T. Throws a RuntimeError when the override does not report kSuccess.
+template <typename T>
+void SetBlockSize(const size_t value, const Device &device) {
+  const auto override_status = OverrideParameters(device(), "TrsvRoutine", PrecisionValue<T>(),
+                                                  {{"TRSV_BLOCK_SIZE", value}});
+  if (override_status != StatusCode::kSuccess) {
+    throw RuntimeError("OverrideParameters failed with status " + ToString(override_status));
+  }
+}
+
+// Applies the given block size and runs one TRSV of dimension 'size' (row-major, lower, no
+// transpose, non-unit diagonal) on buffers[0] (A) and buffers[1] (x), waiting for its completion.
+template <typename T>
+void RunTrsvRoutine(const size_t block_size, Queue& queue, const std::vector<Buffer<T>>& buffers) {
+  SetBlockSize<T>(block_size, queue.GetDevice());
+  auto event = cl_event{};
+  auto routine = Xtrsv<T>(queue, &event); // the event is required to wait for completion below
+  routine.DoTrsv(Layout::kRowMajor, Triangle::kLower, Transpose::kNo, Diagonal::kNonUnit,
+                 size,
+                 buffers[0], 0, size, // A matrix
+                 buffers[1], 0, 1); // X vector
+  if (event != nullptr) { // only wait on and release an event that was actually produced
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+}
+
+// Parses the command-line arguments, times RunTrsvRoutine through TimeRoutine for the block-size
+// range defined below, and writes the timings plus the fastest block size to a JSON file.
+template <typename T>
+void TuneXtrsv(int argc, char* argv[]) {
+  auto command_line_args = RetrieveCommandLineArguments(argc, argv);
+  auto help = std::string{"* Options given/available:\n"};
+  const auto platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
+  const auto device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
+  const auto precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
+  const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
+  fprintf(stdout, "%s\n", help.c_str());
+
+  // Values for the block size
+  const auto from = size_t{8};
+  const auto to = size_t{64 + 1};
+  const auto step = size_t{8};
+
+  // OpenCL initialisation
+  const auto platform = Platform(platform_id);
+  const auto device = Device(platform, device_id);
+  if (!PrecisionSupported<T>(device)) {
+    printf("* Unsupported precision, skipping this tuning run\n");
+    return;
+  }
+  const auto context = Context(device);
+  auto queue = Queue(context, device);
+
+  // Buffers
+  auto buffers = std::vector<Buffer<T>>{
+    Buffer<T>(context, size * size),
+    Buffer<T>(context, size)
+  };
+
+  // Performance testing
+  const auto results = TimeRoutine(from, to, step, num_runs, queue, buffers, RunTrsvRoutine<T>);
+
+  // Stores the results in the expected format
+  auto scores = std::vector<TuningResult>();
+  for (const auto &result : results) {
+    if (result.second != -1) { // NOTE(review): -1 presumably marks a failed run - confirm
+      auto tuning_results = Configuration();
+      tuning_results["TRSV_BLOCK_SIZE"] = result.first;
+      tuning_results["PRECISION"] = static_cast<size_t>(precision);
+      scores.emplace_back(TuningResult{"trsv_routine", result.second, tuning_results});
+    }
+  }
+
+  // Computes the best result
+  auto best_time = std::numeric_limits<double>::max();
+  auto best_value = size_t{0};
+  for (const auto &result : results) {
+    if (result.second != -1 && result.second < best_time) {
+      best_time = result.second;
+      best_value = result.first;
+    }
+  }
+  const auto best_string = "TRSV_BLOCK_SIZE=" + ToString(best_value);
+
+  // Outputs the results as JSON to disk, including some meta-data
+  const auto precision_string = std::to_string(static_cast<size_t>(precision));
+  auto metadata = std::vector<std::pair<std::string,std::string>>{
+    {"kernel_family", "trsv_routine"},
+    {"precision", precision_string},
+    {"arg_n", ToString(size)},
+    {"best_kernel", "trsv_routine"},
+    {"best_time", ToString(best_time)},
+    {"best_parameters", best_string}
+  };
+  PrintTimingsToFileAsJSON("clblast_routine_xtrsv_" + precision_string + ".json",
+                           device, platform, metadata, scores);
+
+  printf("* Completed tuning process\n");
+  printf("\n");
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
+  switch(clblast::GetPrecision(command_line_args)) {
+    case clblast::Precision::kSingle: clblast::TuneXtrsv<float>(argc, argv); break;
+    case clblast::Precision::kDouble: clblast::TuneXtrsv<double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle: clblast::TuneXtrsv<float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble: clblast::TuneXtrsv<double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index a66aba4b..c167cd5f 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -52,7 +52,7 @@ using Timing = std::pair<size_t, double>;
 
 template <typename T, typename F>
 std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t step,
-                                const size_t num_runs, const Queue& queue,
+                                const size_t num_runs, Queue& queue,
                                 const std::vector<Buffer<T>>& buffers, F const &routine) {
   auto timings = std::vector<Timing>();
   printf("| value | time |\n");