diff options
-rw-r--r-- | CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/utilities/timing.cpp | 95 | ||||
-rw-r--r-- | src/utilities/timing.hpp | 9 |
3 files changed, 105 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cada61ab..ec757e06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -234,6 +234,7 @@ set(SOURCES
     src/database/database.cpp
     src/routines/common.cpp
     src/utilities/clblast_exceptions.cpp
+    src/utilities/timing.cpp
     src/utilities/utilities.cpp
     src/api_common.cpp
     src/cache.cpp
diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp
new file mode 100644
index 00000000..188e4487
--- /dev/null
+++ b/src/utilities/timing.cpp
@@ -0,0 +1,95 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides helper functions for time measurement and such.
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <exception>
+
+#include "utilities/timing.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Validates the launch configuration against the device limits and then times the kernel.
+//
+// When 'local' is non-empty, a RuntimeErrorCode is thrown if it has more dimensions than
+// device.MaxWorkItemDimensions(), if any entry exceeds the matching device.MaxWorkItemSizes()
+// entry, or if the product of its entries exceeds device.MaxWorkGroupSize(). Entries of 'global'
+// that are smaller than the matching 'local' entry are raised to it. A RuntimeErrorCode is also
+// thrown if device.IsLocalMemoryValid() rejects the kernel's local memory usage. The result is
+// whatever TimeFunction reports for 'num_runs' and a launch-and-wait closure (presumably a time
+// in milliseconds, judging by how TimeKernel prints it).
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local) {
+  auto event = Event();
+
+  if (!local.empty()) {
+    // Tests for validity of the local thread sizes
+    if (local.size() > device.MaxWorkItemDimensions()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
+    }
+    const auto max_work_item_sizes = device.MaxWorkItemSizes();
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (local[i] > max_work_item_sizes[i]) {
+        throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
+      }
+    }
+    auto local_size = size_t{1};
+    for (auto &item: local) { local_size *= item; }
+    if (local_size > device.MaxWorkGroupSize()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+    }
+
+    // Make sure the global thread sizes are at least equal to the local sizes. Only the dimensions
+    // present in both vectors are compared: indexing 'local' with every index of 'global' would
+    // read past the end of 'local' whenever 'global' has more dimensions.
+    const auto shared_dims = (global.size() < local.size()) ? global.size() : local.size();
+    for (auto i=size_t{0}; i<shared_dims; ++i) {
+      if (global[i] < local[i]) { global[i] = local[i]; }
+    }
+  }
+
+  // Tests for local memory usage
+  const auto local_mem_usage = kernel.LocalMemUsage(device);
+  if (!device.IsLocalMemoryValid(local_mem_usage)) {
+    throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
+  }
+
+  // Times the kernel: each invocation of the closure launches it, waits for the event's
+  // completion and then finishes the queue
+  const auto run_kernel_func = [&]() {
+    kernel.Launch(queue, global, local, event.pointer());
+    event.WaitForCompletion();
+    queue.Finish();
+  };
+  return TimeFunction(num_runs, run_kernel_func);
+}
+
+// Wrapper around RunKernelTimed that reports the outcome on standard output instead of throwing:
+// prints and returns the measured time on success; on any exception, prints the status code
+// obtained from DispatchExceptionCatchAll and returns -1.0 to mark the measurement as invalid.
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local) {
+  try {
+    const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local);
+    printf(" %7.2lf ms |", time_ms);
+    return time_ms;
+  }
+  catch (...) {
+    const auto status_code = DispatchExceptionCatchAll(true);
+    printf(" error %3d |", static_cast<int>(status_code));
+    return -1.0; // invalid
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index bfad6147..fb5b9e78 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -21,6 +21,7 @@
 #include <chrono>
 
 #include "utilities/utilities.hpp"
+#include "routines/common.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -40,6 +41,14 @@ double TimeFunction(const size_t num_runs, F const &function) {
 
 // =================================================================================================
 
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local);
+
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local);
+
+// =================================================================================================
+
 using Timing = std::pair<size_t, double>;
 
 template <typename T, typename F>