diff options
Diffstat (limited to 'src/tuning/routines/xgemm.cpp')
-rw-r--r-- | src/tuning/routines/xgemm.cpp | 39 |
1 file changed, 26 insertions, 13 deletions
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp index a880c97e..cd22137a 100644 --- a/src/tuning/routines/xgemm.cpp +++ b/src/tuning/routines/xgemm.cpp @@ -18,7 +18,7 @@ #include <assert.h> #include "utilities/utilities.hpp" -#include "utilities/timing.hpp" +#include "tuning/tuning.hpp" namespace clblast { // ================================================================================================= @@ -68,7 +68,7 @@ void TuneXgemm(int argc, char* argv[]) { const auto platform = Platform(platform_id); const auto device = Device(platform, device_id); if (!PrecisionSupported<T>(device)) { - printf("* Unsupported precision, skipping this tuning run\n\n"); + printf("* Unsupported precision, skipping this tuning run\n"); return; } const auto context = Context(device); @@ -81,18 +81,18 @@ void TuneXgemm(int argc, char* argv[]) { auto buffers = std::vector<Buffer<T>>{a_mat, b_mat, c_mat}; // In-direct version - printf("[----------] Testing the in-direct GEMM routine for m=n=k\n"); + printf("\n* Testing the in-direct GEMM routine for m=n=k\n"); ForceSelectIndirectFrom<T>(0, device); const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>); // Direct version - printf("[----------] Testing the direct GEMM routine for m=n=k\n"); + printf("\n* Testing the direct GEMM routine for m=n=k\n"); ForceSelectIndirectFrom<T>(to * to * to + 1, device); const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>); // Determining final score and best kernel selection point assert(indirect.size() == direct.size()); - printf("[----------] Collecting results\n"); + printf("\n* Collecting results\n"); auto ratios = std::vector<double>(indirect.size()); for (auto i = size_t{0}; i < indirect.size(); ++i) { ratios[i] = indirect[i].second / direct[i].second; @@ -104,42 +104,55 @@ void TuneXgemm(int argc, char* argv[]) { for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); } 
const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1); + auto tuning_results = Configuration(); + tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = indirect[i].first; + tuning_results["PRECISION"] = static_cast<size_t>(precision); scores[i] = TuningResult{ "gemm_kernel_selection", (relative_score * relative_score) * 100 + epsilon, // squared for proper default computation - TuningParameters{ - TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first}, - TuningParameter{"PRECISION", static_cast<size_t>(precision)} - } + tuning_results }; } // Displaying results - printf("[ -------> ] value indirect direct score (lowest means best switching point)\n"); + printf("| value | indirect | direct | score | (lowest score == best switching point)\n"); + printf("x---------x-------------x-------------x----------x\n"); for (auto i = size_t{0}; i < indirect.size(); ++i) { assert(indirect[i].first == direct[i].first); const auto value = indirect[i].first; if (indirect[i].second != -1 && direct[i].second != -1) { const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6); const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6); - printf("[ -------> ] %7zu %8.2lf %8.2lf %8.2lf\n", + printf("| %7zu | %8.2lf ms | %8.2lf ms | %8.3lf |\n", value, gflops_indirect, gflops_direct, scores[i].score); } } + printf("x---------x-------------x-------------x----------x\n"); + printf("\n"); + + // Computes the best switching point + auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; + const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison); + const auto best_switching_point = best_configuration->config["XGEMM_MIN_INDIRECT_SIZE"]; + const auto best_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(best_switching_point); // 
Outputs the results as JSON to disk, including some meta-data const auto precision_string = std::to_string(static_cast<size_t>(precision)); auto metadata = std::vector<std::pair<std::string,std::string>>{ {"kernel_family", "gemm_routine"}, + {"precision", precision_string}, {"arg_from", ToString(from)}, {"arg_to", ToString(to)}, {"arg_step", ToString(step)}, - {"precision", precision_string}, + {"best_kernel", best_configuration->name}, + {"best_time", ToString(best_configuration->score)}, + {"best_parameters", best_string} }; PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json", device, platform, metadata, scores); - printf("[ STATUS ] All done\n"); + printf("* Completed tuning process\n"); + printf("\n"); } // ================================================================================================= |