summaryrefslogtreecommitdiff
path: root/src/utilities
diff options
context:
space:
mode:
Diffstat (limited to 'src/utilities')
-rw-r--r--src/utilities/clblast_exceptions.cpp33
-rw-r--r--src/utilities/clblast_exceptions.hpp3
-rw-r--r--src/utilities/compile.cpp99
-rw-r--r--src/utilities/compile.hpp36
-rw-r--r--src/utilities/timing.cpp79
-rw-r--r--src/utilities/timing.hpp71
-rw-r--r--src/utilities/utilities.cpp31
-rw-r--r--src/utilities/utilities.hpp7
8 files changed, 300 insertions, 59 deletions
diff --git a/src/utilities/clblast_exceptions.cpp b/src/utilities/clblast_exceptions.cpp
index 32526215..25e5f4be 100644
--- a/src/utilities/clblast_exceptions.cpp
+++ b/src/utilities/clblast_exceptions.cpp
@@ -45,7 +45,7 @@ RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreas
// =================================================================================================
-StatusCode DispatchException()
+StatusCode DispatchException(const bool silent)
{
const char *message = nullptr;
StatusCode status;
@@ -66,12 +66,41 @@ StatusCode DispatchException()
status = StatusCode::kUnknownError;
}
- if (message) {
+ if (message && !silent) {
fprintf(stderr, "CLBlast: %s\n", message);
}
return status;
}
+// Converts the exception that is currently being handled into a StatusCode. The bare 'throw;'
+// re-raises that exception so that the catch clauses below can inspect its type, which means this
+// function has to be called from inside a catch handler. Exception types that are not listed
+// explicitly end up in the final catch-all clause and yield StatusCode::kUnknownError. A message
+// is printed to stderr if one was set and 'silent' is false; the status is returned either way.
+StatusCode DispatchExceptionCatchAll(const bool silent)
+{
+ const char *message = nullptr;
+ StatusCode status;
+
+ // NOTE(review): the order of the catch clauses presumably matters if these exception types
+ // derive from one another - confirm against the class hierarchy before re-ordering them.
+ try {
+ throw;
+ } catch (BLASError &e) {
+ // no message is printed for invalid argument errors
+ status = e.status();
+ } catch (CLCudaAPIError &e) {
+ message = e.what();
+ status = static_cast<StatusCode>(e.status());
+ } catch (RuntimeErrorCode &e) {
+ message = e.what();
+ status = e.status();
+ } catch (Error<std::runtime_error> &e) {
+ message = e.what();
+ status = StatusCode::kUnknownError;
+ } catch (...) {
+ message = "unknown exception type";
+ status = StatusCode::kUnknownError;
+ }
+
+ // Only reports when a message was set above and the caller did not ask for silence
+ if (message && !silent) {
+ fprintf(stderr, "CLBlast: %s\n", message);
+ }
+ return status;
+}
// =================================================================================================
StatusCode DispatchExceptionForC()
diff --git a/src/utilities/clblast_exceptions.hpp b/src/utilities/clblast_exceptions.hpp
index a790be9c..9bd38187 100644
--- a/src/utilities/clblast_exceptions.hpp
+++ b/src/utilities/clblast_exceptions.hpp
@@ -37,7 +37,8 @@ class RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> {
// =================================================================================================
// Handles (most of the) runtime exceptions and converts them to StatusCode
-StatusCode DispatchException();
+StatusCode DispatchException(const bool silent = false);
+// Converts the in-flight exception to a StatusCode; exception types that are not handled
+// explicitly map to StatusCode::kUnknownError. No message is printed if 'silent' is true.
+StatusCode DispatchExceptionCatchAll(const bool silent = false);
// Handles remaining exceptions and converts them to StatusCode::kUnhandledError
StatusCode DispatchExceptionForC();
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
new file mode 100644
index 00000000..2a55506e
--- /dev/null
+++ b/src/utilities/compile.cpp
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the kernel compilation functions (see the header for more information).
+//
+// =================================================================================================
+
+#include <vector>
+#include <chrono>
+
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source code. A header is assembled first (the precision and routine
+// defines, device-specific switches, the optional OpenCL-to-CUDA translation layer and the common
+// kernel header) and is prepended to 'source_string'. The result is then built for 'device' with
+// the given 'options'. When the build throws a CLCudaAPIBuildError, the build log is printed to
+// stdout if the status is a compilation warning/error and 'silent' is false; the exception is
+// re-thrown to the caller in any case.
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options, const bool silent) {
+  auto header_string = std::string{""};
+
+  // Adds the precision as an integer define
+  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
+
+  // Adds the name of the routine as a define
+  header_string += "#define ROUTINE_" + routine_name + "\n";
+
+  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+  // which it is known to work with all OpenCL platforms.
+  if (device.IsNVIDIA() || device.IsARM()) {
+    header_string += "#define USE_INLINE_KEYWORD 1\n";
+  }
+
+  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
+  // performance, but might result in a reduced accuracy.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_CL_MAD 1\n";
+  }
+
+  // For specific devices, use staggered/shuffled workgroup indices.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device.IsARM() && device.IsGPU()) {
+    header_string += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Optionally adds a translation header from OpenCL kernels to CUDA kernels. It is appended to
+  // the header: 'source_string' is a const reference and can therefore not be modified.
+  #ifdef CUDA_API
+    header_string +=
+      #include "kernels/opencl_to_cuda.h"
+    ;
+  #endif
+
+  // Loads the common header (typedefs and defines and such)
+  header_string +=
+    #include "kernels/common.opencl"
+  ;
+
+  // Prints details of the routine to compile in case of debugging in verbose mode
+  #ifdef VERBOSE
+    printf("[DEBUG] Compiling routine '%s-%s'\n",
+           routine_name.c_str(), ToString(precision).c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
+  // Compiles the kernel: the assembled header comes first, followed by the routine's own source
+  auto program = Program(context, header_string + source_string);
+  try {
+    program.Build(device, options);
+  } catch (const CLCudaAPIBuildError &e) {
+    if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) {
+      fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
+              program.GetBuildInfo(device).c_str());
+    }
+    throw;  // the caller decides how to handle the failed build
+  }
+
+  // Prints the elapsed compilation time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+  #endif
+
+  return program;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp
new file mode 100644
index 00000000..0315d70c
--- /dev/null
+++ b/src/utilities/compile.hpp
@@ -0,0 +1,36 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the CLBlast way to compile a kernel from source, used for the library and for
+// the auto-tuners.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_UTILITIES_COMPILE_H_
+#define CLBLAST_UTILITIES_COMPILE_H_
+
+#include <string>
+#include <vector>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source code
+//   source_string: the kernel source; a generated header is prepended to it before building
+//   precision:     emitted into the header as the integer define PRECISION
+//   routine_name:  emitted into the header as the define ROUTINE_<routine_name>
+//   device:        selects the device-specific defines and is the device to build for
+//   context:       the context in which the program is created
+//   options:       passed on to the build of the program
+//   silent:        if true, the compiler error/warning log is not printed when the build fails
+// A build failure is re-thrown to the caller.
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+ const std::string &routine_name,
+ const Device& device, const Context& context,
+ std::vector<std::string>& options, const bool silent = false);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_UTILITIES_COMPILE_H_
+#endif
diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp
new file mode 100644
index 00000000..af6a8ff2
--- /dev/null
+++ b/src/utilities/timing.cpp
@@ -0,0 +1,79 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides helper functions for time measurement and such.
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <exception>
+
+#include "utilities/timing.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Times a kernel by handing a launch-and-wait function together with 'num_runs' to TimeFunction
+// and returns the result of that call. Before launching, the local thread sizes (when given) and
+// the local memory usage of the kernel are checked against the limits reported by the device; a
+// RuntimeErrorCode with the matching status is thrown when a check fails. 'global' is taken by
+// value because its entries can be raised to be at least as large as those of 'local'.
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                      std::vector<size_t> global, const std::vector<size_t> &local) {
+  auto event = Event();
+
+  if (!local.empty()) {
+    // Tests for validity of the local thread sizes
+    if (local.size() > device.MaxWorkItemDimensions()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
+    }
+    const auto max_work_item_sizes = device.MaxWorkItemSizes();
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (local[i] > max_work_item_sizes[i]) {
+        throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
+      }
+    }
+    auto local_size = size_t{1};
+    for (auto &item: local) { local_size *= item; }
+    if (local_size > device.MaxWorkGroupSize()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+    }
+
+    // Make sure the global thread sizes are at least equal to the local sizes. Only dimensions
+    // that are present in both vectors are compared, such that 'local' is never indexed out of
+    // bounds when 'global' has more dimensions.
+    for (auto i=size_t{0}; i<global.size() && i<local.size(); ++i) {
+      if (global[i] < local[i]) { global[i] = local[i]; }
+    }
+  }
+
+  // Tests for local memory usage
+  const auto local_mem_usage = kernel.LocalMemUsage(device);
+  if (!device.IsLocalMemoryValid(local_mem_usage)) {
+    throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
+  }
+
+  // Times the kernel: each invocation launches it, waits for its event and then for the queue
+  const auto run_kernel_func = [&]() {
+    kernel.Launch(queue, global, local, event.pointer());
+    event.WaitForCompletion();
+    queue.Finish();
+  };
+  return TimeFunction(num_runs, run_kernel_func);
+}
+
+// Wrapper around RunKernelTimed which reports to stdout instead of throwing: on success the time
+// is printed as a table cell and returned, on any exception the status code is printed instead
+// and -1.0 is returned to mark the measurement as invalid.
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+                  std::vector<size_t> global, const std::vector<size_t> &local) {
+  auto result_ms = -1.0; // remains at the 'invalid' marker unless the timed run succeeds
+  try {
+    result_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local);
+    printf(" %9.2lf ms |", result_ms);
+  }
+  catch (...) {
+    // Has to be called from within the handler: it re-throws the exception to classify it
+    const auto error_code = static_cast<int>(DispatchExceptionCatchAll(true));
+    printf(" error %-5d |", error_code);
+  }
+  return result_ms;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index bfad6147..a66aba4b 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -40,6 +40,14 @@ double TimeFunction(const size_t num_runs, F const &function) {
// =================================================================================================
+// Checks the local thread sizes (if any are given) and the local memory usage of the kernel
+// against the device limits, throwing a RuntimeErrorCode when a check fails, and then passes a
+// launch-and-wait function together with 'num_runs' to TimeFunction and returns its result.
+// 'global' is taken by value since its entries can be raised to match those of 'local'.
+double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+ std::vector<size_t> global, const std::vector<size_t> &local);
+
+// Calls RunKernelTimed and prints the resulting time. In case of an exception, the status code
+// is printed instead and -1.0 is returned rather than propagating the exception.
+double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
+ std::vector<size_t> global, const std::vector<size_t> &local);
+
+// =================================================================================================
+
using Timing = std::pair<size_t, double>;
template <typename T, typename F>
@@ -47,76 +55,27 @@ std::vector<Timing> TimeRoutine(const size_t from, const size_t to, const size_t
const size_t num_runs, const Queue& queue,
const std::vector<Buffer<T>>& buffers, F const &routine) {
auto timings = std::vector<Timing>();
+ printf("| value | time |\n");
+ printf("x--------x--------------x\n");
for (auto value = from; value < to; value += step) {
- printf("[ RUN ] Running with value %zu\n", value);
+ printf("| %6zu |", value);
try {
const auto FunctionToTune = [&]() { routine(value, queue, buffers); };
const auto time_ms = TimeFunction(num_runs, FunctionToTune);
- printf("[ OK ] Took %.2lf ms\n", time_ms);
+ printf(" %9.2lf ms |\n", time_ms);
timings.push_back({value, time_ms});
}
catch (...) {
- printf("[ ERROR ] Exception caught\n");
+ const auto status_code = DispatchExceptionCatchAll(true);
+ printf(" error %-5d |\n", static_cast<int>(status_code));
timings.push_back({value, -1.0}); // invalid
}
}
+ printf("x--------x--------------x\n");
return timings;
}
// =================================================================================================
-
-using TuningParameter = std::pair<std::string, size_t>;
-using TuningParameters = std::vector<TuningParameter>;
-struct TuningResult { std::string name; double score; TuningParameters parameters; };
-
-void PrintTimingsToFileAsJSON(const std::string &filename,
- const Device& device, const Platform& platform,
- const std::vector<std::pair<std::string,std::string>> &metadata,
- const std::vector<TuningResult>& tuning_results) {
- printf("[ STATUS ] Writing results to '%s'\n", filename.c_str());
- auto file = fopen(filename.c_str(), "w");
- fprintf(file, "{\n");
- for (auto &datum: metadata) {
- fprintf(file, " \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str());
- }
- fprintf(file, " \"platform_version\": \"%s\",\n", platform.Version().c_str());
- fprintf(file, " \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str());
- fprintf(file, " \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str());
- fprintf(file, " \"clblast_device_type\": \"%s\",\n", device.Type().c_str());
- fprintf(file, " \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str());
- fprintf(file, " \"device_core_clock\": \"%zu\",\n", device.CoreClock());
- fprintf(file, " \"device_compute_units\": \"%zu\",\n", device.ComputeUnits());
- fprintf(file, " \"results\": [\n");
-
- // Loops over all results
- auto num_results = tuning_results.size();
- for (auto r = size_t{0}; r < num_results; ++r) {
- auto result = tuning_results[r];
- fprintf(file, " {\n");
- fprintf(file, " \"kernel\": \"%s\",\n", result.name.c_str());
- fprintf(file, " \"time\": %.3lf,\n", result.score);
-
- // Loops over all the parameters for this result
- fprintf(file, " \"parameters\": {");
- auto num_configs = result.parameters.size();
- for (auto p=size_t{0}; p<num_configs; ++p) {
- auto config = result.parameters[p];
- fprintf(file, "\"%s\": %zu", config.first.c_str(), config.second);
- if (p < num_configs-1) { fprintf(file, ","); }
- }
- fprintf(file, "}\n");
-
- // The footer
- fprintf(file, " }");
- if (r < num_results - 1) { fprintf(file, ","); }
- fprintf(file, "\n");
- }
- fprintf(file, " ]\n");
- fprintf(file, "}\n");
- fclose(file);
-}
-
-// =================================================================================================
} // namespace clblast
// CLBLAST_TIMING_H_
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index f2574104..1546fbf5 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -397,6 +397,37 @@ template <> bool PrecisionSupported<half>(const Device &device) { return device.
// =================================================================================================
+// Computes the square of (val1 - val2) and returns it as a double, for example as one term of an
+// L2 error. The generic version subtracts and multiplies in type T and converts the product.
+template <typename T>
+double SquaredDifference(const T val1, const T val2) {
+  const auto delta = val1 - val2;
+  return static_cast<double>(delta * delta);
+}
+
+// Explicit instantiations of the generic version for the standard data-types
+template double SquaredDifference<float>(const float, const float);
+template double SquaredDifference<double>(const double, const double);
+
+// Complex data-types: the squared differences of the real and imaginary parts are added up
+template <>
+double SquaredDifference(const float2 val1, const float2 val2) {
+  return SquaredDifference(val1.real(), val2.real()) +
+         SquaredDifference(val1.imag(), val2.imag());
+}
+template <>
+double SquaredDifference(const double2 val1, const double2 val2) {
+  return SquaredDifference(val1.real(), val2.real()) +
+         SquaredDifference(val1.imag(), val2.imag());
+}
+
+// Half-precision: both values go through HalfToFloat before the generic version is applied
+template <>
+double SquaredDifference(const half val1, const half val2) {
+  const auto lhs = HalfToFloat(val1);
+  const auto rhs = HalfToFloat(val2);
+  return SquaredDifference(lhs, rhs);
+}
+
+
+// =================================================================================================
+
// High-level info
std::string GetDeviceType(const Device& device) {
return device.Type();
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index f56226be..e26721b3 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -98,6 +98,7 @@ constexpr auto kArgDilationW = "dilationw";
// The tuner-specific arguments in string form
constexpr auto kArgFraction = "fraction";
constexpr auto kArgHeuristicSelection = "heuristic";
+constexpr auto kArgMaxL2Norm = "max_l2_norm";
// PSO tuner-specific arguments in string form
constexpr auto kArgPsoSwarmSize = "pso_swarm_size";
constexpr auto kArgPsoInfGlobal = "pso_inf_global";
@@ -323,6 +324,12 @@ bool PrecisionSupported(const Device &device);
// =================================================================================================
+// Retrieves the squared difference, used for example for computing the L2 error. Only the
+// declaration lives here: utilities.cpp provides the instantiations for float and double and the
+// specialisations for float2, double2 and half.
+template <typename T>
+double SquaredDifference(const T val1, const T val2);
+
+// =================================================================================================
+
// Device information in a specific CLBlast form
std::string GetDeviceType(const Device& device);
std::string GetDeviceVendor(const Device& device);