summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-12-23 13:34:57 +0100
committerCedric Nugteren <web@cedricnugteren.nl>2017-12-23 13:34:57 +0100
commitaa7db4f987360fe1956add9391c6e81aa61b75f3 (patch)
tree71411054266ebfa87d514bbb5235f80323b24af9
parent2b007450b99fbbc198f0688c9f75ae5e09ffe4fa (diff)
Added TRSV block-size tuner
-rw-r--r--CMakeLists.txt4
-rw-r--r--src/routines/levelx/xinvert.cpp5
-rw-r--r--src/tuning/routines/xgemm.cpp2
-rw-r--r--src/tuning/routines/xtrsv.cpp142
-rw-r--r--src/utilities/timing.hpp2
5 files changed, 149 insertions, 6 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9f7264f8..f83ba33c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -196,8 +196,8 @@ set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
xgemm xgemm_direct xgemv invert)
set(DATABASES copy pad padtranspose transpose xaxpy xdot
xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger
- gemm_routine)
-set(ROUTINE_TUNERS xgemm)
+ gemm_routine trsv_routine)
+set(ROUTINE_TUNERS xgemm xtrsv)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index 5c21d5ce..5fbc5fe5 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -27,8 +27,9 @@ namespace clblast {
template <typename T>
Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
- #include "../../kernels/level3/level3.opencl"
- #include "../../kernels/level3/invert_diagonal_blocks.opencl"
+ #include "../../kernels/level3/level3.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
+ #include "../../kernels/level3/invert_diagonal_blocks.opencl"
}) {
}
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index cf750519..83db6104 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -72,7 +72,7 @@ void TuneXgemm(int argc, char* argv[]) {
return;
}
const auto context = Context(device);
- const auto queue = Queue(context, device);
+ auto queue = Queue(context, device);
// Buffers
auto buffers = std::vector<Buffer<T>>{
diff --git a/src/tuning/routines/xtrsv.cpp b/src/tuning/routines/xtrsv.cpp
new file mode 100644
index 00000000..9e8f26fa
--- /dev/null
+++ b/src/tuning/routines/xtrsv.cpp
@@ -0,0 +1,142 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file tunes the Xtrsv routine at a high-level: choosing an appropriate block size
+//
+// =================================================================================================
+
+#include <exception>
+#include <string>
+#include <vector>
+#include <limits>
+
+#include "utilities/utilities.hpp"
+#include "tuning/tuning.hpp"
+#include "routines/routines.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+constexpr auto size = size_t{1024}; // 'n' argument
+
+template <typename T>
+void SetBlockSize(const size_t value, const Device &device) {
+ const auto override_status = OverrideParameters(device(), "TrsvRoutine", PrecisionValue<T>(),
+ {{"TRSV_BLOCK_SIZE", value}});
+ if (override_status != StatusCode::kSuccess) {
+ throw RuntimeError("OverrideParameters failed with status " + ToString(override_status));
+ }
+}
+
+template <typename T>
+void RunTrsvRoutine(const size_t block_size, Queue& queue, const std::vector<Buffer<T>>& buffers) {
+ SetBlockSize<T>(block_size, queue.GetDevice());
+ auto event = cl_event{};
+ auto routine = Xtrsv<T>(queue, nullptr);
+ routine.DoTrsv(Layout::kRowMajor, Triangle::kLower, Transpose::kNo, Diagonal::kNonUnit,
+ size,
+ buffers[0], 0, size, // A matrix
+ buffers[1], 0, 1); // X vector
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+}
+
+template <typename T>
+void TuneXtrsv(int argc, char* argv[]) {
+ auto command_line_args = RetrieveCommandLineArguments(argc, argv);
+ auto help = std::string{"* Options given/available:\n"};
+ const auto platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
+ const auto device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
+ const auto precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
+ const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
+ fprintf(stdout, "%s\n", help.c_str());
+
+ // Values for the block size
+ const auto from = size_t{8};
+ const auto to = size_t{64 + 1};
+ const auto step = size_t{8};
+
+ // OpenCL initialisation
+ const auto platform = Platform(platform_id);
+ const auto device = Device(platform, device_id);
+ if (!PrecisionSupported<T>(device)) {
+ printf("* Unsupported precision, skipping this tuning run\n");
+ return;
+ }
+ const auto context = Context(device);
+ auto queue = Queue(context, device);
+
+ // Buffers
+ auto buffers = std::vector<Buffer<T>>{
+ Buffer<T>(context, size * size),
+ Buffer<T>(context, size)
+ };
+
+ // Performance testing
+ const auto results = TimeRoutine(from, to, step, num_runs, queue, buffers, RunTrsvRoutine<T>);
+
+ // Stores the results in the expected format
+ auto scores = std::vector<TuningResult>();
+ for (const auto &result : results) {
+ if (result.second != -1) {
+ auto tuning_results = Configuration();
+ tuning_results["TRSV_BLOCK_SIZE"] = result.first;
+ tuning_results["PRECISION"] = static_cast<size_t>(precision);
+ scores.emplace_back(TuningResult{"trsv_routine", result.second, tuning_results});
+ }
+ }
+
+ // Computes the best result
+ auto best_time = std::numeric_limits<double>::max();
+ auto best_value = size_t{0};
+ for (const auto &result : results) {
+ if (result.second != -1 && result.second < best_time) {
+ best_time = result.second;
+ best_value = result.first;
+ }
+ }
+ const auto best_string = "TRSV_BLOCK_SIZE=" + ToString(best_value);
+
+ // Outputs the results as JSON to disk, including some meta-data
+ const auto precision_string = std::to_string(static_cast<size_t>(precision));
+ auto metadata = std::vector<std::pair<std::string,std::string>>{
+ {"kernel_family", "trsv_routine"},
+ {"precision", precision_string},
+ {"arg_n", ToString(size)},
+ {"best_kernel", "trsv_routine"},
+ {"best_time", ToString(best_time)},
+ {"best_parameters", best_string}
+ };
+ PrintTimingsToFileAsJSON("clblast_routine_xtrsv_" + precision_string + ".json",
+ device, platform, metadata, scores);
+
+ printf("* Completed tuning process\n");
+ printf("\n");
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
+ switch(clblast::GetPrecision(command_line_args)) {
+ case clblast::Precision::kSingle: clblast::TuneXtrsv<float>(argc, argv); break;
+ case clblast::Precision::kDouble: clblast::TuneXtrsv<double>(argc, argv); break;
+ case clblast::Precision::kComplexSingle: clblast::TuneXtrsv<float2>(argc, argv); break;
+ case clblast::Precision::kComplexDouble: clblast::TuneXtrsv<double2>(argc, argv); break;
+ }
+ return 0;
+}
+
+// =================================================================================================
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index a66aba4b..c167cd5f 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -52,7 +52,7 @@ using Timing = std::pair<size_t, double>;
template <typename T, typename F>
std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t step,
- const size_t num_runs, const Queue& queue,
+ const size_t num_runs, Queue& queue,
const std::vector<Buffer<T>>& buffers, F const &routine) {
auto timings = std::vector<Timing>();
printf("| value | time |\n");