diff options
Diffstat (limited to 'src/utilities')
-rw-r--r-- | src/utilities/clblast_exceptions.cpp | 33 | ||||
-rw-r--r-- | src/utilities/clblast_exceptions.hpp | 3 | ||||
-rw-r--r-- | src/utilities/compile.cpp | 99 | ||||
-rw-r--r-- | src/utilities/compile.hpp | 36 | ||||
-rw-r--r-- | src/utilities/timing.cpp | 79 | ||||
-rw-r--r-- | src/utilities/timing.hpp | 71 | ||||
-rw-r--r-- | src/utilities/utilities.cpp | 31 | ||||
-rw-r--r-- | src/utilities/utilities.hpp | 7 |
8 files changed, 300 insertions, 59 deletions
diff --git a/src/utilities/clblast_exceptions.cpp b/src/utilities/clblast_exceptions.cpp
index 32526215..25e5f4be 100644
--- a/src/utilities/clblast_exceptions.cpp
+++ b/src/utilities/clblast_exceptions.cpp
@@ -45,7 +45,7 @@ RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreas
 
 // =================================================================================================
 
-StatusCode DispatchException()
+StatusCode DispatchException(const bool silent)
 {
   const char *message = nullptr;
   StatusCode status;
@@ -66,12 +66,41 @@ StatusCode DispatchException()
     status = StatusCode::kUnknownError;
   }
 
-  if (message) {
+  if (message && !silent) {
     fprintf(stderr, "CLBlast: %s\n", message);
   }
   return status;
 }
 
+StatusCode DispatchExceptionCatchAll(const bool silent)
+{
+  const char *message = nullptr;
+  StatusCode status;
+
+  try {
+    throw; // rethrows the exception currently being handled so the handlers below can classify it
+  } catch (BLASError &e) {
+    // no message is printed for invalid argument errors
+    status = e.status();
+  } catch (CLCudaAPIError &e) {
+    message = e.what();
+    status = static_cast<StatusCode>(e.status());
+  } catch (RuntimeErrorCode &e) {
+    message = e.what();
+    status = e.status();
+  } catch (Error<std::runtime_error> &e) {
+    message = e.what();
+    status = StatusCode::kUnknownError;
+  } catch (...) { // any exception type not matched by the handlers above
+    message = "unknown exception type";
+    status = StatusCode::kUnknownError;
+  }
+
+  if (message && !silent) { // 'silent' only suppresses the stderr message, not the status
+    fprintf(stderr, "CLBlast: %s\n", message);
+  }
+  return status;
+}
 // =================================================================================================
 
 StatusCode DispatchExceptionForC()
diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp
index a790be9c..9bd38187 100644
--- a/src/utilities/clblast_exceptions.hpp
+++ b/src/utilities/clblast_exceptions.hpp
@@ -37,7 +37,8 @@ class RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> {
 // =================================================================================================
 
 // Handles (most of the) runtime exceptions and converts them to StatusCode
-StatusCode DispatchException();
+StatusCode DispatchException(const bool silent = false);
+StatusCode DispatchExceptionCatchAll(const bool silent = false); // also has a catch (...)
 
 // Handles remaining exceptions and converts them to StatusCode::kUnhandledError
 StatusCode DispatchExceptionForC();
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
new file mode 100644
index 00000000..2a55506e
--- /dev/null
+++ b/src/utilities/compile.cpp
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the kernel compilation functions (see the header for more information).
+//
+// =================================================================================================
+
+#include <vector>
+#include <chrono>
+
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source: prepends precision/routine/device defines and the common header
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options, const bool silent) {
+  auto header_string = std::string{""};
+
+  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
+
+  // Adds the name of the routine as a define
+  header_string += "#define ROUTINE_" + routine_name + "\n";
+
+  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+  // which it is known to work with all OpenCL platforms.
+  if (device.IsNVIDIA() || device.IsARM()) {
+    header_string += "#define USE_INLINE_KEYWORD 1\n";
+  }
+
+  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
+  // performance, but might result in a reduced accuracy.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_CL_MAD 1\n";
+  }
+
+  // For specific devices, use staggered/shuffled workgroup indices.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device.IsARM() && device.IsGPU()) {
+    header_string += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
+  #ifdef CUDA_API
+    header_string +=  // goes into the header: 'source_string' is a const reference (read-only)
+      #include "kernels/opencl_to_cuda.h"
+    ;
+  #endif
+
+  // Loads the common header (typedefs and defines and such)
+  header_string +=
+    #include "kernels/common.opencl"
+  ;
+
+  // Prints details of the routine to compile in case of debugging in verbose mode
+  #ifdef VERBOSE
+    printf("[DEBUG] Compiling routine '%s-%s'\n",
+           routine_name.c_str(), ToString(precision).c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
+  // Compiles the kernel: the generated header is placed in front of the caller's source
+  auto program = Program(context, header_string + source_string);
+  try {
+    program.Build(device, options);
+  } catch (const CLCudaAPIBuildError &e) {
+    if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) {
+      fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
+              program.GetBuildInfo(device).c_str());
+    }
+    throw; // the build error is always propagated; 'silent' only suppresses the log output
+  }
+
+  // Prints the elapsed compilation time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+  #endif
+
+  return program;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp
new file mode 100644
index 00000000..0315d70c
--- /dev/null
+++ b/src/utilities/compile.hpp
@@ -0,0 +1,36 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the CLBlast way to compile a kernel from source, used for the library and for
+// the auto-tuners.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_UTILITIES_COMPILE_H_
+#define CLBLAST_UTILITIES_COMPILE_H_
+
+#include <string>
+#include <vector>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source code
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options, const bool silent = false);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_UTILITIES_COMPILE_H_
+#endif
diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp
new file mode 100644
index 00000000..af6a8ff2
--- /dev/null
+++ b/src/utilities/timing.cpp
@@ -0,0 +1,79 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides helper functions for time measurement and such.
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <exception>
+
+#include "utilities/timing.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local) {
+  auto event = Event();
+
+  if (!local.empty()) { // an empty 'local' skips the local-size checks and the clamping below
+    // Tests for validity of the local thread sizes
+    if (local.size() > device.MaxWorkItemDimensions()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
+    }
+    const auto max_work_item_sizes = device.MaxWorkItemSizes();
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (local[i] > max_work_item_sizes[i]) {
+        throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
+      }
+    }
+    auto local_size = size_t{1};
+    for (auto &item: local) { local_size *= item; }
+    if (local_size > device.MaxWorkGroupSize()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+    }
+
+    // Make sure the global thread sizes are at least equal to the local sizes (shared dims only)
+    for (auto i=size_t{0}; i<global.size() && i<local.size(); ++i) {
+      if (global[i] < local[i]) { global[i] = local[i]; }
+    }
+  }
+
+  // Tests for local memory usage
+  const auto local_mem_usage = kernel.LocalMemUsage(device);
+  if (!device.IsLocalMemoryValid(local_mem_usage)) {
+    throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
+  }
+
+  // Times the kernel: each timed call launches it and waits for both the event and the queue
+  const auto run_kernel_func = [&]() {
+    kernel.Launch(queue, global, local, event.pointer());
+    event.WaitForCompletion();
+    queue.Finish();
+  };
+  return TimeFunction(num_runs, run_kernel_func);
+}
+
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local) {
+  try {
+    const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local);
+    printf(" %9.2lf ms |", time_ms);
+    return time_ms;
+  }
+  catch (...) {
+    const auto status_code = DispatchExceptionCatchAll(true); // silent: only the code is printed
+    printf(" error %-5d |", static_cast<int>(status_code));
+    return -1.0; // invalid
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index bfad6147..a66aba4b 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -40,6 +40,14 @@ double TimeFunction(const size_t num_runs, F const &function) {
 
 // =================================================================================================
 
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local);
+
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local);
+
+// =================================================================================================
+
 using Timing = std::pair<size_t, double>;
 
 template <typename T, typename F>
@@ -47,76 +55,27 @@ std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t
                                 const size_t num_runs, const Queue& queue,
                                 const std::vector<Buffer<T>>& buffers, F const &routine) {
   auto timings = std::vector<Timing>();
+  printf("| value | time |\n");
+  printf("x--------x--------------x\n");
   for (auto value = from; value < to; value += step) {
-    printf("[ RUN ] Running with value %zu\n", value);
+    printf("| %6zu |", value);
     try {
       const auto FunctionToTune = [&]() { routine(value, queue, buffers); };
       const auto time_ms = TimeFunction(num_runs, FunctionToTune);
-      printf("[ OK ] Took %.2lf ms\n", time_ms);
+      printf(" %9.2lf ms |\n", time_ms);
       timings.push_back({value, time_ms});
     }
     catch (...) {
-      printf("[ ERROR ] Exception caught\n");
+      const auto status_code = DispatchExceptionCatchAll(true);
+      printf(" error %-5d |\n", static_cast<int>(status_code));
       timings.push_back({value, -1.0}); // invalid
     }
   }
+  printf("x--------x--------------x\n");
   return timings;
 }
 
 // =================================================================================================
-
-using TuningParameter = std::pair<std::string, size_t>;
-using TuningParameters = std::vector<TuningParameter>;
-struct TuningResult { std::string name; double score; TuningParameters parameters; };
-
-void PrintTimingsToFileAsJSON(const std::string &filename,
-                              const Device& device, const Platform& platform,
-                              const std::vector<std::pair<std::string,std::string>> &metadata,
-                              const std::vector<TuningResult>& tuning_results) {
-  printf("[ STATUS ] Writing results to '%s'\n", filename.c_str());
-  auto file = fopen(filename.c_str(), "w");
-  fprintf(file, "{\n");
-  for (auto &datum: metadata) {
-    fprintf(file, " \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
-  }
-  fprintf(file, " \"platform_version\": \"%s\",\n", platform.Version().c_str());
-  fprintf(file, " \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
-  fprintf(file, " \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
-  fprintf(file, " \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
-  fprintf(file, " \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
-  fprintf(file, " \"device_core_clock\": \"%zu\",\n", device.CoreClock());
-  fprintf(file, " \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
-  fprintf(file, " \"results\": [\n");
-
-  // Loops over all results
-  auto num_results = tuning_results.size();
-  for (auto r = size_t{0}; r < num_results; ++r) {
-    auto result = tuning_results[r];
-    fprintf(file, " {\n");
-    fprintf(file, " \"kernel\": \"%s\",\n", result.name.c_str());
-    fprintf(file, " \"time\": %.3lf,\n", result.score);
-
-    // Loops over all the parameters for this result
-    fprintf(file, " \"parameters\": {");
-    auto num_configs = result.parameters.size();
-    for (auto p=size_t{0}; p<num_configs; ++p) {
-      auto config = result.parameters[p];
-      fprintf(file, "\"%s\": %zu", config.first.c_str(), config.second);
-      if (p < num_configs-1) { fprintf(file, ","); }
-    }
-    fprintf(file, "}\n");
-
-    // The footer
-    fprintf(file, " }");
-    if (r < num_results - 1) { fprintf(file, ","); }
-    fprintf(file, "\n");
-  }
-  fprintf(file, " ]\n");
-  fprintf(file, "}\n");
-  fclose(file);
-}
-
-// =================================================================================================
 } // namespace clblast
 
 // CLBLAST_TIMING_H_
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index f2574104..1546fbf5 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -397,6 +397,37 @@ template <> bool PrecisionSupported<half>(const Device &device) { return device.
// ================================================================================================= +// Retrieves the squared difference, used for example for computing the L2 error +template <typename T> +double SquaredDifference(const T val1, const T val2) { + const auto difference = (val1 - val2); + return static_cast<double>(difference * difference); +} + +// Compiles the default case for standard data-types +template double SquaredDifference<float>(const float, const float); +template double SquaredDifference<double>(const double, const double); + +// Specialisations for non-standard data-types +template <> +double SquaredDifference(const float2 val1, const float2 val2) { + const auto real = SquaredDifference(val1.real(), val2.real()); + const auto imag = SquaredDifference(val1.imag(), val2.imag()); + return real + imag; +} +template <> +double SquaredDifference(const double2 val1, const double2 val2) { + const auto real = SquaredDifference(val1.real(), val2.real()); + const auto imag = SquaredDifference(val1.imag(), val2.imag()); + return real + imag; +} +template <> +double SquaredDifference(const half val1, const half val2) { + return SquaredDifference(HalfToFloat(val1), HalfToFloat(val2)); +} + +// ================================================================================================= + // High-level info std::string GetDeviceType(const Device& device) { return device.Type(); diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index f56226be..e26721b3 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -98,6 +98,7 @@ constexpr auto kArgDilationW = "dilationw"; // The tuner-specific arguments in string form constexpr auto kArgFraction = "fraction"; constexpr auto kArgHeuristicSelection = "heuristic"; +constexpr auto kArgMaxL2Norm = "max_l2_norm"; // PSO tuner-specific arguments in string form constexpr auto kArgPsoSwarmSize = "pso_swarm_size"; constexpr auto kArgPsoInfGlobal = "pso_inf_global"; @@ 
-323,6 +324,12 @@ bool PrecisionSupported(const Device &device); // ================================================================================================= +// Retrieves the squared difference, used for example for computing the L2 error +template <typename T> +double SquaredDifference(const T val1, const T val2); + +// ================================================================================================= + // Device information in a specific CLBlast form std::string GetDeviceType(const Device& device); std::string GetDeviceVendor(const Device& device); |