diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2018-01-20 10:19:28 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-01-20 10:19:28 +0100 |
commit | b2c946c517736b3e33a5a651a907c7fb10d646f6 (patch) | |
tree | 8253935a3b251cda5ffaf6b1d7e78dde13610cc5 | |
parent | b35e3d1e5326cdc257daa170eb243800616cfc26 (diff) | |
parent | c3f9371d16a66fa28906a3be9925a646e72ea471 (diff) |
Merge pull request #244 from CNugteren/kernel_selection_batched_gemm
Kernel selection for batched GEMM
-rw-r--r-- | CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/routines/levelx/xgemmbatched.cpp | 2 | ||||
-rw-r--r-- | src/routines/levelx/xgemmstridedbatched.cpp | 2 | ||||
-rw-r--r-- | src/tuning/routines/routine_tuner.hpp | 136 | ||||
-rw-r--r-- | src/tuning/routines/xgemm.cpp | 141 |
5 files changed, 189 insertions, 93 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 18254658..a6aae958 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -388,6 +388,7 @@ if(TUNERS) src/utilities/utilities.hpp src/tuning/configurations.hpp src/tuning/tuning.hpp + src/tuning/routines/routine_tuner.hpp src/kernel_preprocessor.hpp) set(TUNERS_COMMON ${TUNERS_COMMON} ${TUNERS_HEADERS}) diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp index 1c0953e8..6a089b8a 100644 --- a/src/routines/levelx/xgemmbatched.cpp +++ b/src/routines/levelx/xgemmbatched.cpp @@ -96,7 +96,7 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans } // Selects which version of the batched GEMM to run - const auto do_gemm_direct = true; + const auto do_gemm_direct = Xgemm<T>::UseDirectKernel(m, n, k, db_["XGEMM_MIN_INDIRECT_SIZE"]); if (do_gemm_direct) { // single generic kernel BatchedGemmDirect(m, n, k, alphas_device, a_buffer, a_offsets_int, a_ld, b_buffer, b_offsets_int, b_ld, diff --git a/src/routines/levelx/xgemmstridedbatched.cpp b/src/routines/levelx/xgemmstridedbatched.cpp index affbceee..6165a396 100644 --- a/src/routines/levelx/xgemmstridedbatched.cpp +++ b/src/routines/levelx/xgemmstridedbatched.cpp @@ -76,7 +76,7 @@ void XgemmStridedBatched<T>::DoGemmStridedBatched(const Layout layout, const Tra } // Selects which version of the batched GEMM to run - const auto do_gemm_direct = true; + const auto do_gemm_direct = Xgemm<T>::UseDirectKernel(m, n, k, db_["XGEMM_MIN_INDIRECT_SIZE"]);; if (do_gemm_direct) { // single generic kernel BatchedGemmDirect(m, n, k, alpha, a_buffer, a_offset, a_ld, a_stride, diff --git a/src/tuning/routines/routine_tuner.hpp b/src/tuning/routines/routine_tuner.hpp new file mode 100644 index 00000000..2aa0b3ce --- /dev/null +++ b/src/tuning/routines/routine_tuner.hpp @@ -0,0 +1,136 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the part of the auto-tuner for tuning entire routines (i.e. switching +// between direct and in-direct GEMM kernels) +// +// ================================================================================================= + +#ifndef CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_ +#define CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_ + +#include <exception> +#include <string> +#include <vector> +#include <assert.h> + +#include "utilities/utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +template <typename T> +void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device, + const std::string &tuner_name, const std::string& parameter_name) { + const auto override_status = OverrideParameters(device(), tuner_name, PrecisionValue<T>(), + {{parameter_name, minimum_size}}); + if (override_status != StatusCode::kSuccess) { + throw RuntimeError("OverrideParameters failed with status " + ToString(override_status)); + } +} + +// Computes the best switching point +TuningResult GetBestResult(const std::vector<TuningResult>& scores) { + auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; + const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison); + return *best_configuration; +} + +// Tunes at kernel-level +template <typename T, typename F> +void TuneKernelSelection(const Platform& platform, const Device& device, const Context& context, + Queue& queue, const Precision precision, F const &routine, + const size_t from, const size_t to, const size_t step, const size_t batch_count, + const size_t num_runs, const std::string &name, const std::string &tuner_name, + const std::string &family_name, const std::string& parameter_name) { + + // Buffers + auto buffers = std::vector<Buffer<T>>{ + Buffer<T>(context, to * to * batch_count), + Buffer<T>(context, to * to * batch_count), + Buffer<T>(context, to * to * batch_count) + }; + + // In-direct version + printf("\n* Testing the in-direct %s routine for m=n=k\n", name.c_str()); + ForceSelectIndirectFrom<T>(0, device, tuner_name, parameter_name); + const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, routine); + + // Direct version + printf("\n* Testing the direct %s routine for m=n=k\n", name.c_str()); + ForceSelectIndirectFrom<T>(batch_count * to + 1, device, tuner_name, parameter_name); + const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, routine); + + // Determining final score and best kernel selection point + assert(indirect.size() == direct.size()); + printf("\n* Collecting results\n"); + auto ratios = std::vector<double>(indirect.size()); + for (auto i = size_t{0}; i < indirect.size(); ++i) { + ratios[i] = indirect[i].second / direct[i].second; + } + auto scores = std::vector<TuningResult>(ratios.size()); + for (auto i = size_t{0}; i < scores.size(); ++i) { + auto score = 0; + for (auto j = size_t{0}; j < i; ++j) { score += (ratios[j] <= 1.0); } + for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); } + const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones + const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1); + auto tuning_results = Configuration(); + tuning_results[parameter_name] = indirect[i].first; + tuning_results["PRECISION"] = static_cast<size_t>(precision); + scores[i] = TuningResult{ + name + "_kernel_selection", + (relative_score * relative_score) * 100 + epsilon, // squared for proper default computation + tuning_results + }; + } + + // Displaying results + printf("| || %12s indirect || %12s direct || |\n", name.c_str(), name.c_str()); + printf("| m=n=k || ms | GFLOPS || ms | GFLOPS || score | (lowest score == best switching point)\n"); + printf("x---------xx----------x------------xx----------x----------xx----------x\n"); + for (auto i = size_t{0}; i < indirect.size(); ++i) { + assert(indirect[i].first == direct[i].first); + const auto value = indirect[i].first; + if (indirect[i].second != -1 && direct[i].second != -1) { + const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6); + const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6); + printf("| %7zu || %8.2lf | %10.1lf || %8.2lf | %8.1lf || %8.3lf |\n", + value, indirect[i].second, gflops_indirect, direct[i].second, gflops_direct, scores[i].score); + } + } + printf("x---------xx----------x------------xx----------x----------xx----------x\n"); + printf("\n"); + + const auto best_result = GetBestResult(scores); + const auto best_switching_point = best_result.config.at(parameter_name); + const auto best_string = parameter_name + "=" + ToString(best_switching_point); + + // Outputs the results as JSON to disk, including some meta-data + const auto precision_string = std::to_string(static_cast<size_t>(precision)); + auto metadata = std::vector<std::pair<std::string,std::string>>{ + {"kernel_family", family_name}, + {"precision", precision_string}, + {"arg_from", ToString(from)}, + {"arg_to", ToString(to)}, + {"arg_step", ToString(step)}, + {"best_kernel", best_result.name}, + {"best_time", ToString(best_result.score)}, + {"best_parameters", best_string} + }; + PrintTimingsToFileAsJSON("clblast_" + family_name + "_" + precision_string + ".json", + device, platform, metadata, scores); +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TUNING_ROUTINES_ROUTINE_TUNER_H_ +#endif diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp index 83db6104..0721ad7c 100644 --- a/src/tuning/routines/xgemm.cpp +++ b/src/tuning/routines/xgemm.cpp @@ -15,10 +15,9 @@ #include <exception> #include <string> #include <vector> -#include <assert.h> #include "utilities/utilities.hpp" -#include "tuning/tuning.hpp" +#include "tuning/routines/routine_tuner.hpp" namespace clblast { // ================================================================================================= @@ -40,15 +39,48 @@ void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Bu clReleaseEvent(event); } -template <typename T> -void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device) { - const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(), - {{"XGEMM_MIN_INDIRECT_SIZE", minimum_size}}); - if (override_status != StatusCode::kSuccess) { - throw RuntimeError("OverrideParameters failed with status " + ToString(override_status)); +template <typename T, size_t batch_count> +void RunGemmBatchedRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) { + auto offsets = std::vector<size_t>(batch_count); + auto factors = std::vector<T>(batch_count); + for (auto i = size_t{0}; i < batch_count; ++i) { + offsets[i] = batch_count * value; + factors[i] = ConstantOne<T>(); } + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = GemmBatched(Layout::kRowMajor, Transpose::kNo, Transpose::kNo, + value, value, value, factors.data(), + buffers[0](), offsets.data(), value, + buffers[1](), offsets.data(), value, factors.data(), + buffers[2](), offsets.data(), value, batch_count, + &queue_plain, &event); + if (status != StatusCode::kSuccess) { + throw RuntimeError("GemmBatched failed with status " + ToString(status)); + } + clWaitForEvents(1, &event); + clReleaseEvent(event); } +template <typename T, size_t batch_count> +void RunGemmStridedBatchedRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = GemmStridedBatched(Layout::kRowMajor, Transpose::kNo, Transpose::kNo, + value, value, value, ConstantOne<T>(), + buffers[0](), 0, value, value * value, + buffers[1](), 0, value, value * value, ConstantOne<T>(), + buffers[2](), 0, value, value * value, batch_count, + &queue_plain, &event); + if (status != StatusCode::kSuccess) { + throw RuntimeError("Gemm failed with status " + ToString(status)); + } + clWaitForEvents(1, &event); + clReleaseEvent(event); +} + +// ================================================================================================= + template <typename T> void TuneXgemm(int argc, char* argv[]) { auto command_line_args = RetrieveCommandLineArguments(argc, argv); @@ -59,11 +91,6 @@ void TuneXgemm(int argc, char* argv[]) { const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10}); fprintf(stdout, "%s\n", help.c_str()); - // Values for m, n, and k - const auto from = size_t{64}; - const auto to = size_t{2048}; - const auto step = size_t{64}; - // OpenCL initialisation const auto platform = Platform(platform_id); const auto device = Device(platform, device_id); @@ -74,84 +101,16 @@ void TuneXgemm(int argc, char* argv[]) { const auto context = Context(device); auto queue = Queue(context, device); - // Buffers - auto buffers = std::vector<Buffer<T>>{ - Buffer<T>(context, to * to), - Buffer<T>(context, to * to), - Buffer<T>(context, to * to) - }; - - // In-direct version - printf("\n* Testing the in-direct GEMM routine for m=n=k\n"); - ForceSelectIndirectFrom<T>(0, device); - const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>); - - // Direct version - printf("\n* Testing the direct GEMM routine for m=n=k\n"); - ForceSelectIndirectFrom<T>(to + 1, device); - const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>); - - // Determining final score and best kernel selection point - assert(indirect.size() == direct.size()); - printf("\n* Collecting results\n"); - auto ratios = std::vector<double>(indirect.size()); - for (auto i = size_t{0}; i < indirect.size(); ++i) { - ratios[i] = indirect[i].second / direct[i].second; - } - auto scores = std::vector<TuningResult>(ratios.size()); - for (auto i = size_t{0}; i < scores.size(); ++i) { - auto score = 0; - for (auto j = size_t{0}; j < i; ++j) { score += (ratios[j] <= 1.0); } - for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); } - const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones - const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1); - auto tuning_results = Configuration(); - tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = indirect[i].first; - tuning_results["PRECISION"] = static_cast<size_t>(precision); - scores[i] = TuningResult{ - "gemm_kernel_selection", - (relative_score * relative_score) * 100 + epsilon, // squared for proper default computation - tuning_results - }; - } - - // Displaying results - printf("| || indirect GEMM || direct GEMM || |\n"); - printf("| m=n=k || ms | GFLOPS || ms | GFLOPS || score | (lowest score == best switching point)\n"); - printf("x---------xx--------x----------xx--------x----------xx----------x\n"); - for (auto i = size_t{0}; i < indirect.size(); ++i) { - assert(indirect[i].first == direct[i].first); - const auto value = indirect[i].first; - if (indirect[i].second != -1 && direct[i].second != -1) { - const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6); - const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6); - printf("| %7zu || %6.2lf | %8.1lf || %6.2lf | %8.1lf || %8.3lf |\n", - value, indirect[i].second, gflops_indirect, direct[i].second, gflops_direct, scores[i].score); - } - } - printf("x---------xx--------x----------xx--------x----------xx----------x\n"); - printf("\n"); - - // Computes the best switching point - auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; - const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison); - const auto best_switching_point = best_configuration->config["XGEMM_MIN_INDIRECT_SIZE"]; - const auto best_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(best_switching_point); - - // Outputs the results as JSON to disk, including some meta-data - const auto precision_string = std::to_string(static_cast<size_t>(precision)); - auto metadata = std::vector<std::pair<std::string,std::string>>{ - {"kernel_family", "gemm_routine"}, - {"precision", precision_string}, - {"arg_from", ToString(from)}, - {"arg_to", ToString(to)}, - {"arg_step", ToString(step)}, - {"best_kernel", best_configuration->name}, - {"best_time", ToString(best_configuration->score)}, - {"best_parameters", best_string} - }; - PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json", - device, platform, metadata, scores); + // Run the tuners for the XGEMM routines + TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>, + 64, 2048, 64, 1, num_runs, + "gemm", "GemmRoutine", "gemm_routine", "XGEMM_MIN_INDIRECT_SIZE"); + //TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmBatchedRoutine<T, 30>, + // 16, 128, 32, 30, num_runs, + // "gemmbatched", "GemmRoutine", "gemm_routine_2", "XGEMMBATCHED_MIN_INDIRECT_SIZE"); + //TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmStridedBatchedRoutine<T, 30>, + // 16, 128, 32, 30, num_runs, + // "gemmstridedbatched", "GemmRoutine", "gemm_routine_3", "XGEMMSTRIDEDBATCHED_MIN_INDIRECT_SIZE"); printf("* Completed tuning process\n"); printf("\n"); |