From 76d2b7f0b6fecb81ddc6912f5aae3e1ee9b89b29 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 19 Nov 2017 12:59:52 +0100 Subject: Revived the GEMM routine tuner; minor formatting changes --- src/tuning/routines/xgemm.cpp | 39 ++++++++++++++++++++++++++------------- src/tuning/tuning.cpp | 2 +- src/tuning/tuning.hpp | 2 +- src/utilities/timing.cpp | 4 ++-- src/utilities/timing.hpp | 10 +++++++--- 5 files changed, 37 insertions(+), 20 deletions(-) (limited to 'src') diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp index a880c97e..cd22137a 100644 --- a/src/tuning/routines/xgemm.cpp +++ b/src/tuning/routines/xgemm.cpp @@ -18,7 +18,7 @@ #include #include "utilities/utilities.hpp" -#include "utilities/timing.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= @@ -68,7 +68,7 @@ void TuneXgemm(int argc, char* argv[]) { const auto platform = Platform(platform_id); const auto device = Device(platform, device_id); if (!PrecisionSupported(device)) { - printf("* Unsupported precision, skipping this tuning run\n\n"); + printf("* Unsupported precision, skipping this tuning run\n"); return; } const auto context = Context(device); @@ -81,18 +81,18 @@ void TuneXgemm(int argc, char* argv[]) { auto buffers = std::vector>{a_mat, b_mat, c_mat}; // In-direct version - printf("[----------] Testing the in-direct GEMM routine for m=n=k\n"); + printf("\n* Testing the in-direct GEMM routine for m=n=k\n"); ForceSelectIndirectFrom(0, device); const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine); // Direct version - printf("[----------] Testing the direct GEMM routine for m=n=k\n"); + printf("\n* Testing the direct GEMM routine for m=n=k\n"); ForceSelectIndirectFrom(to * to * to + 1, device); const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine); // Determining final score and best kernel selection point assert(indirect.size() == direct.size()); - printf("[----------] Collecting results\n"); + printf("\n* Collecting results\n"); auto ratios = std::vector(indirect.size()); for (auto i = size_t{0}; i < indirect.size(); ++i) { ratios[i] = indirect[i].second / direct[i].second; @@ -104,42 +104,55 @@ void TuneXgemm(int argc, char* argv[]) { for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); } const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones const auto relative_score = static_cast(score) / static_cast(scores.size() - 1); + auto tuning_results = Configuration(); + tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = indirect[i].first; + tuning_results["PRECISION"] = static_cast(precision); scores[i] = TuningResult{ "gemm_kernel_selection", (relative_score * relative_score) * 100 + epsilon, // squared for proper default computation - TuningParameters{ - TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first}, - TuningParameter{"PRECISION", static_cast(precision)} - } + tuning_results }; } // Displaying results - printf("[ -------> ] value indirect direct score (lowest means best switching point)\n"); + printf("| value | indirect | direct | score | (lowest score == best switching point)\n"); + printf("x---------x-------------x-------------x----------x\n"); for (auto i = size_t{0}; i < indirect.size(); ++i) { assert(indirect[i].first == direct[i].first); const auto value = indirect[i].first; if (indirect[i].second != -1 && direct[i].second != -1) { const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6); const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6); - printf("[ -------> ] %7zu %8.2lf %8.2lf %8.2lf\n", + printf("| %7zu | %8.2lf ms | %8.2lf ms | %8.3lf |\n", value, gflops_indirect, gflops_direct, scores[i].score); } } + printf("x---------x-------------x-------------x----------x\n"); + printf("\n"); + + // Computes the best switching point + auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; + const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison); + const auto best_switching_point = best_configuration->config["XGEMM_MIN_INDIRECT_SIZE"]; + const auto best_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(best_switching_point); // Outputs the results as JSON to disk, including some meta-data const auto precision_string = std::to_string(static_cast(precision)); auto metadata = std::vector>{ {"kernel_family", "gemm_routine"}, + {"precision", precision_string}, {"arg_from", ToString(from)}, {"arg_to", ToString(to)}, {"arg_step", ToString(step)}, - {"precision", precision_string}, + {"best_kernel", best_configuration->name}, + {"best_time", ToString(best_configuration->score)}, + {"best_parameters", best_string} }; PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json", device, platform, metadata, scores); - printf("[ STATUS ] All done\n"); + printf("* Completed tuning process\n"); + printf("\n"); } // ================================================================================================= diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp index bd8337b4..5db7d2fb 100644 --- a/src/tuning/tuning.cpp +++ b/src/tuning/tuning.cpp @@ -77,7 +77,7 @@ void PrintTimingsToFileAsJSON(const std::string &filename, void print_separator(const size_t parameters_size) { printf("x------x-------x"); for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); } - printf("-x----------x------------x--------x-------------------x\n"); + printf("-x----------x--------------x--------x-------------------x\n"); } // ================================================================================================= diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index 41f394c1..95464001 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -211,7 +211,7 @@ void Tuner(int argc, char* argv[]) { printf("\n"); printf("| ID | total |"); for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } - printf("param | compiles | time | %6s | status |\n", settings.performance_unit.c_str()); + printf("param | compiles | time | %6s | status |\n", settings.performance_unit.c_str()); print_separator(settings.parameters.size()); // First runs a reference example to compare against diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp index 188e4487..af6a8ff2 100644 --- a/src/utilities/timing.cpp +++ b/src/utilities/timing.cpp @@ -65,12 +65,12 @@ double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Dev std::vector global, const std::vector &local) { try { const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local); - printf(" %7.2lf ms |", time_ms); + printf(" %9.2lf ms |", time_ms); return time_ms; } catch (...) { const auto status_code = DispatchExceptionCatchAll(true); - printf(" error %3d |", static_cast(status_code)); + printf(" error %-5d |", static_cast(status_code)); return -1.0; // invalid } } diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp index e8040058..a66aba4b 100644 --- a/src/utilities/timing.hpp +++ b/src/utilities/timing.hpp @@ -55,19 +55,23 @@ std::vector TimeRoutine(const size_t from, const size_t to, const size_t const size_t num_runs, const Queue& queue, const std::vector>& buffers, F const &routine) { auto timings = std::vector(); + printf("| value | time |\n"); + printf("x--------x--------------x\n"); for (auto value = from; value < to; value += step) { - printf("[ RUN ] Running with value %zu\n", value); + printf("| %6zu |", value); try { const auto FunctionToTune = [&]() { routine(value, queue, buffers); }; const auto time_ms = TimeFunction(num_runs, FunctionToTune); - printf("[ OK ] Took %.2lf ms\n", time_ms); + printf(" %9.2lf ms |\n", time_ms); timings.push_back({value, time_ms}); } catch (...) { - printf("[ ERROR ] Exception caught\n"); + const auto status_code = DispatchExceptionCatchAll(true); + printf(" error %-5d |\n", static_cast(status_code)); timings.push_back({value, -1.0}); // invalid } } + printf("x--------x--------------x\n"); return timings; } -- cgit v1.2.3