Factored out the creation of the OpenCL header and the program compilation

author: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-11 16:14:43 +0100
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-11 16:14:43 +0100
commit: 677afd3b96b2cbd3d2aae77e90cab87d2cc1eaa2 (patch)
tree: 6df1df0d4ca59a22d0668c9b1d859da3ef32308a /src/routines
parent: c41d219ea42087c1b8d933b733b381005123cb91 (diff)
2 files changed, 84 insertions, 0 deletions
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index 5b178e53..c415d9fd 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -19,6 +19,84 @@
 namespace clblast {
 // =================================================================================================
 
+// Compiles a program from source code: prepends a generated header (precision/routine/device
+// defines and the common OpenCL header) to 'source_string', builds it and re-throws build errors
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options) {
+  auto header_string = std::string{""};
+  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
+
+  // Adds the name of the routine as a define
+  header_string += "#define ROUTINE_" + routine_name + "\n";
+
+  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+  // which it is known to work with all OpenCL platforms.
+  if (device.IsNVIDIA() || device.IsARM()) {
+    header_string += "#define USE_INLINE_KEYWORD 1\n";
+  }
+
+  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
+  // performance, but might result in a reduced accuracy.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_CL_MAD 1\n";
+  }
+
+  // For specific devices, use staggered/shuffled workgroup indices.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device.IsARM() && device.IsGPU()) {
+    header_string += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
+  #ifdef CUDA_API
+    header_string +=  // the header is extended here: 'source_string' is const and must stay intact
+      #include "kernels/opencl_to_cuda.h"
+    ;
+  #endif
+
+  // Loads the common header (typedefs and defines and such)
+  header_string +=
+    #include "kernels/common.opencl"
+  ;
+
+  // Prints details of the routine to compile in case of debugging in verbose mode
+  #ifdef VERBOSE
+    printf("[DEBUG] Compiling routine '%s-%s'\n",
+           routine_name.c_str(), ToString(precision).c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
+  // Compiles the kernel: the generated header always precedes the routine's own source
+  auto program = Program(context, header_string + source_string);
+  try {
+    program.Build(device, options);
+  } catch (const CLCudaAPIBuildError &e) {
+    if (program.StatusIsCompilationWarningOrError(e.status())) {
+      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
+              program.GetBuildInfo(device).c_str());
+    }
+    throw;
+  }
+
+  // Prints the elapsed compilation time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+  #endif
+
+  return program;
+}
+
+// =================================================================================================
+
 // Enqueues a kernel, waits for completion, and checks for errors
 void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                std::vector<size_t> global, const std::vector<size_t> &local,
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index bf3b1762..8a93d74a 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -25,6 +25,12 @@
 namespace clblast {
 // =================================================================================================
 
+// Compiles 'source_string' (prefixed with a generated header) into a program built for 'device'
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options);
+
 // Enqueues a kernel, waits for completion, and checks for errors
 void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                std::vector<size_t> global, const std::vector<size_t> &local,
author	Cedric Nugteren <web@cedricnugteren.nl>	2017-11-11 16:14:43 +0100
committer	Cedric Nugteren <web@cedricnugteren.nl>	2017-11-11 16:14:43 +0100
commit	677afd3b96b2cbd3d2aae77e90cab87d2cc1eaa2 (patch)
tree	6df1df0d4ca59a22d0668c9b1d859da3ef32308a /src/routines
parent	c41d219ea42087c1b8d933b733b381005123cb91 (diff)