summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG1
-rw-r--r--src/tuning/routines/xgemm.cpp39
-rw-r--r--src/tuning/tuning.cpp2
-rw-r--r--src/tuning/tuning.hpp2
-rw-r--r--src/utilities/timing.cpp4
-rw-r--r--src/utilities/timing.hpp10
6 files changed, 38 insertions, 20 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 200fdc53..95508951 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,6 @@
Development (next version)
+- Re-designed and integrated the auto-tuner, no more dependency on CLTune
- Added tuned parameters for various devices (see README)
Version 1.2.0
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index a880c97e..cd22137a 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -18,7 +18,7 @@
#include <assert.h>
#include "utilities/utilities.hpp"
-#include "utilities/timing.hpp"
+#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
@@ -68,7 +68,7 @@ void TuneXgemm(int argc, char* argv[]) {
const auto platform = Platform(platform_id);
const auto device = Device(platform, device_id);
if (!PrecisionSupported<T>(device)) {
- printf("* Unsupported precision, skipping this tuning run\n\n");
+ printf("* Unsupported precision, skipping this tuning run\n");
return;
}
const auto context = Context(device);
@@ -81,18 +81,18 @@ void TuneXgemm(int argc, char* argv[]) {
auto buffers = std::vector<Buffer<T>>{a_mat, b_mat, c_mat};
// In-direct version
- printf("[----------] Testing the in-direct GEMM routine for m=n=k\n");
+ printf("\n* Testing the in-direct GEMM routine for m=n=k\n");
ForceSelectIndirectFrom<T>(0, device);
const auto indirect = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
// Direct version
- printf("[----------] Testing the direct GEMM routine for m=n=k\n");
+ printf("\n* Testing the direct GEMM routine for m=n=k\n");
ForceSelectIndirectFrom<T>(to * to * to + 1, device);
const auto direct = TimeRoutine(from, to, step, num_runs, queue, buffers, RunGemmRoutine<T>);
// Determining final score and best kernel selection point
assert(indirect.size() == direct.size());
- printf("[----------] Collecting results\n");
+ printf("\n* Collecting results\n");
auto ratios = std::vector<double>(indirect.size());
for (auto i = size_t{0}; i < indirect.size(); ++i) {
ratios[i] = indirect[i].second / direct[i].second;
@@ -104,42 +104,55 @@ void TuneXgemm(int argc, char* argv[]) {
for (auto j = i + 1; j < ratios.size(); ++j) { score += (ratios[j] > 1.0); }
const auto epsilon = (scores.size() - i) / 1e3; // favour later results over earlier ones
const auto relative_score = static_cast<double>(score) / static_cast<double>(scores.size() - 1);
+ auto tuning_results = Configuration();
+ tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = indirect[i].first;
+ tuning_results["PRECISION"] = static_cast<size_t>(precision);
scores[i] = TuningResult{
"gemm_kernel_selection",
(relative_score * relative_score) * 100 + epsilon, // squared for proper default computation
- TuningParameters{
- TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first},
- TuningParameter{"PRECISION", static_cast<size_t>(precision)}
- }
+ tuning_results
};
}
// Displaying results
- printf("[ -------> ] value indirect direct score (lowest means best switching point)\n");
+ printf("| value | indirect | direct | score | (lowest score == best switching point)\n");
+ printf("x---------x-------------x-------------x----------x\n");
for (auto i = size_t{0}; i < indirect.size(); ++i) {
assert(indirect[i].first == direct[i].first);
const auto value = indirect[i].first;
if (indirect[i].second != -1 && direct[i].second != -1) {
const auto gflops_indirect = (2 * value * value * value) / (indirect[i].second * 1.0e6);
const auto gflops_direct = (2 * value * value * value) / (direct[i].second * 1.0e6);
- printf("[ -------> ] %7zu %8.2lf %8.2lf %8.2lf\n",
+ printf("| %7zu | %8.2lf ms | %8.2lf ms | %8.3lf |\n",
value, gflops_indirect, gflops_direct, scores[i].score);
}
}
+ printf("x---------x-------------x-------------x----------x\n");
+ printf("\n");
+
+ // Computes the best switching point
+ auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
+ const auto best_configuration = std::min_element(scores.begin(), scores.end(), comparison);
+ const auto best_switching_point = best_configuration->config["XGEMM_MIN_INDIRECT_SIZE"];
+ const auto best_string = "XGEMM_MIN_INDIRECT_SIZE=" + ToString(best_switching_point);
// Outputs the results as JSON to disk, including some meta-data
const auto precision_string = std::to_string(static_cast<size_t>(precision));
auto metadata = std::vector<std::pair<std::string,std::string>>{
{"kernel_family", "gemm_routine"},
+ {"precision", precision_string},
{"arg_from", ToString(from)},
{"arg_to", ToString(to)},
{"arg_step", ToString(step)},
- {"precision", precision_string},
+ {"best_kernel", best_configuration->name},
+ {"best_time", ToString(best_configuration->score)},
+ {"best_parameters", best_string}
};
PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json",
device, platform, metadata, scores);
- printf("[ STATUS ] All done\n");
+ printf("* Completed tuning process\n");
+ printf("\n");
}
// =================================================================================================
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index bd8337b4..5db7d2fb 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -77,7 +77,7 @@ void PrintTimingsToFileAsJSON(const std::string &filename,
void print_separator(const size_t parameters_size) {
printf("x------x-------x");
for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); }
- printf("-x----------x------------x--------x-------------------x\n");
+ printf("-x----------x--------------x--------x-------------------x\n");
}
// =================================================================================================
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 41f394c1..95464001 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -211,7 +211,7 @@ void Tuner(int argc, char* argv[]) {
printf("\n");
printf("| ID | total |");
for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); }
- printf("param | compiles | time | %6s | status |\n", settings.performance_unit.c_str());
+ printf("param | compiles | time | %6s | status |\n", settings.performance_unit.c_str());
print_separator(settings.parameters.size());
// First runs a reference example to compare against
diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp
index 188e4487..af6a8ff2 100644
--- a/src/utilities/timing.cpp
+++ b/src/utilities/timing.cpp
@@ -65,12 +65,12 @@ double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Dev
std::vector<size_t> global, const std::vector<size_t> &local) {
try {
const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local);
- printf(" %7.2lf ms |", time_ms);
+ printf(" %9.2lf ms |", time_ms);
return time_ms;
}
catch (...) {
const auto status_code = DispatchExceptionCatchAll(true);
- printf(" error %3d |", static_cast<int>(status_code));
+ printf(" error %-5d |", static_cast<int>(status_code));
return -1.0; // invalid
}
}
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index e8040058..a66aba4b 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -55,19 +55,23 @@ std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t
const size_t num_runs, const Queue& queue,
const std::vector<Buffer<T>>& buffers, F const &routine) {
auto timings = std::vector<Timing>();
+ printf("| value | time |\n");
+ printf("x--------x--------------x\n");
for (auto value = from; value < to; value += step) {
- printf("[ RUN ] Running with value %zu\n", value);
+ printf("| %6zu |", value);
try {
const auto FunctionToTune = [&]() { routine(value, queue, buffers); };
const auto time_ms = TimeFunction(num_runs, FunctionToTune);
- printf("[ OK ] Took %.2lf ms\n", time_ms);
+ printf(" %9.2lf ms |\n", time_ms);
timings.push_back({value, time_ms});
}
catch (...) {
- printf("[ ERROR ] Exception caught\n");
+ const auto status_code = DispatchExceptionCatchAll(true);
+ printf(" error %-5d |\n", static_cast<int>(status_code));
timings.push_back({value, -1.0}); // invalid
}
}
+ printf("x--------x--------------x\n");
return timings;
}