summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-11-11 16:14:43 +0100
committerCedric Nugteren <web@cedricnugteren.nl>2017-11-11 16:14:43 +0100
commit677afd3b96b2cbd3d2aae77e90cab87d2cc1eaa2 (patch)
tree6df1df0d4ca59a22d0668c9b1d859da3ef32308a
parentc41d219ea42087c1b8d933b733b381005123cb91 (diff)
Factored out the creation of the OpenCL header and the program compilation
-rw-r--r--src/routine.cpp68
-rw-r--r--src/routines/common.cpp78
-rw-r--r--src/routines/common.hpp6
3 files changed, 88 insertions, 64 deletions
diff --git a/src/routine.cpp b/src/routine.cpp
index 81201eea..93882fbf 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -135,74 +135,21 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
- // Collects the parameters for this device in the form of defines, and adds the precision
+ // Collects the parameters for this device in the form of defines
auto source_string = std::string{""};
for (const auto &kernel_name : kernel_names_) {
source_string += db_(kernel_name).GetDefines();
}
- source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
-
- // Adds the name of the routine as a define
- source_string += "#define ROUTINE_"+routine_name_+"\n";
-
- // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
- // which it is known to work with all OpenCL platforms.
- if (device_.IsNVIDIA() || device_.IsARM()) {
- source_string += "#define USE_INLINE_KEYWORD 1\n";
- }
-
- // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
- // performance, but might result in a reduced accuracy.
- if (device_.IsAMD() && device_.IsGPU()) {
- source_string += "#define USE_CL_MAD 1\n";
- }
-
- // For specific devices, use staggered/shuffled workgroup indices.
- if (device_.IsAMD() && device_.IsGPU()) {
- source_string += "#define USE_STAGGERED_INDICES 1\n";
- }
-
- // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
- // performance through better cache behaviour
- if (device_.IsARM() && device_.IsGPU()) {
- source_string += "#define GLOBAL_MEM_FENCE 1\n";
- }
-
- // Optionally adds a translation header from OpenCL kernels to CUDA kernels
- #ifdef CUDA_API
- source_string +=
- #include "kernels/opencl_to_cuda.h"
- ;
- #endif
-
- // Loads the common header (typedefs and defines and such)
- source_string +=
- #include "kernels/common.opencl"
- ;
// Adds routine-specific code to the constructed source string
for (const char *s: source) {
source_string += s;
}
- // Prints details of the routine to compile in case of debugging in verbose mode
- #ifdef VERBOSE
- printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n",
- routine_name_.c_str(), ToString(precision_).c_str(), device_name.c_str());
- const auto start_time = std::chrono::steady_clock::now();
- #endif
+ // Completes the source and compiles the kernel
+ program_ = CompileFromSource(source_string, precision_, routine_name_,
+ device_, context_, options);
- // Compiles the kernel
- program_ = Program(context_, source_string);
- try {
- program_.Build(device_, options);
- } catch (const CLCudaAPIBuildError &e) {
- if (program_.StatusIsCompilationWarningOrError(e.status())) {
- fprintf(stdout, "OpenCL compiler error/warning: %s\n",
- program_.GetBuildInfo(device_).c_str());
- }
- throw;
- }
// Store the compiled binary and program in the cache
BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
@@ -210,13 +157,6 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
Program{ program_ });
-
- // Prints the elapsed compilation time in case of debugging in verbose mode
- #ifdef VERBOSE
- const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
- const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
- printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
- #endif
}
// =================================================================================================
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index 5b178e53..c415d9fd 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -19,6 +19,84 @@
namespace clblast {
// =================================================================================================
+// Compiles a program from source code: prepends a header of defines and common code, then builds it
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options) {
+  auto header_string = std::string{""};
+  // Adds the precision as a define
+  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
+
+  // Adds the name of the routine as a define
+  header_string += "#define ROUTINE_" + routine_name + "\n";
+
+  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+  // which it is known to work with all OpenCL platforms.
+  if (device.IsNVIDIA() || device.IsARM()) {
+    header_string += "#define USE_INLINE_KEYWORD 1\n";
+  }
+
+  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
+  // performance, but might result in a reduced accuracy.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_CL_MAD 1\n";
+  }
+
+  // For specific devices, use staggered/shuffled workgroup indices.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device.IsARM() && device.IsGPU()) {
+    header_string += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Optionally adds the OpenCL-to-CUDA translation header (before the common header below)
+  #ifdef CUDA_API
+    header_string +=
+      #include "kernels/opencl_to_cuda.h"
+    ;
+  #endif
+
+  // Loads the common header (typedefs and defines and such)
+  header_string +=
+    #include "kernels/common.opencl"
+  ;
+
+  // Prints details of the routine to compile in case of debugging in verbose mode
+  #ifdef VERBOSE
+    printf("[DEBUG] Compiling routine '%s-%s'\n",
+           routine_name.c_str(), ToString(precision).c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
+  // Compiles the kernel; on a build failure the compiler log is printed and the error is rethrown
+  auto program = Program(context, header_string + source_string);
+  try {
+    program.Build(device, options);
+  } catch (const CLCudaAPIBuildError &e) {
+    if (program.StatusIsCompilationWarningOrError(e.status())) {
+      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
+              program.GetBuildInfo(device).c_str());
+    }
+    throw;
+  }
+
+  // Prints the elapsed compilation time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+  #endif
+
+  return program;
+}
+
+// =================================================================================================
+
// Enqueues a kernel, waits for completion, and checks for errors
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index bf3b1762..8a93d74a 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -25,6 +25,12 @@
namespace clblast {
// =================================================================================================
+// Compiles a program from source code after prepending the generated defines and the common header
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+const std::string &routine_name,
+const Device& device, const Context& context,
+std::vector<std::string>& options);
+
// Enqueues a kernel, waits for completion, and checks for errors
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,