From 677afd3b96b2cbd3d2aae77e90cab87d2cc1eaa2 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Sat, 11 Nov 2017 16:14:43 +0100
Subject: Factored out the creation of the OpenCL header and the program compilation

---
 src/routine.cpp         | 68 +++---------------------------------------
 src/routines/common.cpp | 82 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/routines/common.hpp |  6 ++++
 3 files changed, 92 insertions(+), 64 deletions(-)

diff --git a/src/routine.cpp b/src/routine.cpp
index 81201eea..93882fbf 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -135,74 +135,21 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
     throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
   }
 
-  // Collects the parameters for this device in the form of defines, and adds the precision
+  // Collects the parameters for this device in the form of defines
   auto source_string = std::string{""};
   for (const auto &kernel_name : kernel_names_) {
     source_string += db_(kernel_name).GetDefines();
   }
-  source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
-
-  // Adds the name of the routine as a define
-  source_string += "#define ROUTINE_"+routine_name_+"\n";
-
-  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
-  // which it is known to work with all OpenCL platforms.
-  if (device_.IsNVIDIA() || device_.IsARM()) {
-    source_string += "#define USE_INLINE_KEYWORD 1\n";
-  }
-
-  // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
-  // performance, but might result in a reduced accuracy.
-  if (device_.IsAMD() && device_.IsGPU()) {
-    source_string += "#define USE_CL_MAD 1\n";
-  }
-
-  // For specific devices, use staggered/shuffled workgroup indices.
-  if (device_.IsAMD() && device_.IsGPU()) {
-    source_string += "#define USE_STAGGERED_INDICES 1\n";
-  }
-
-  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
-  // performance through better cache behaviour
-  if (device_.IsARM() && device_.IsGPU()) {
-    source_string += "#define GLOBAL_MEM_FENCE 1\n";
-  }
-
-  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
-  #ifdef CUDA_API
-    source_string +=
-      #include "kernels/opencl_to_cuda.h"
-    ;
-  #endif
-
-  // Loads the common header (typedefs and defines and such)
-  source_string +=
-    #include "kernels/common.opencl"
-  ;
 
   // Adds routine-specific code to the constructed source string
   for (const char *s: source) {
     source_string += s;
   }
 
-  // Prints details of the routine to compile in case of debugging in verbose mode
-  #ifdef VERBOSE
-    printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n",
-           routine_name_.c_str(), ToString(precision_).c_str(), device_name.c_str());
-    const auto start_time = std::chrono::steady_clock::now();
-  #endif
+  // Completes the source and compiles the kernel
+  program_ = CompileFromSource(source_string, precision_, routine_name_,
+                               device_, context_, options);
 
-  // Compiles the kernel
-  program_ = Program(context_, source_string);
-  try {
-    program_.Build(device_, options);
-  } catch (const CLCudaAPIBuildError &e) {
-    if (program_.StatusIsCompilationWarningOrError(e.status())) {
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
-              program_.GetBuildInfo(device_).c_str());
-    }
-    throw;
-  }
 
   // Store the compiled binary and program in the cache
   BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
@@ -210,13 +157,6 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
 
   ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
                                  Program{ program_ });
-
-  // Prints the elapsed compilation time in case of debugging in verbose mode
-  #ifdef VERBOSE
-    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
-    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
-    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
-  #endif
 }
 
 // =================================================================================================
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index 5b178e53..c415d9fd 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -19,6 +19,88 @@
 namespace clblast {
 // =================================================================================================
 
+// Compiles a program from source code. A generated header is prepended to 'source_string': the
+// precision and routine-name defines, the device-dependent defines, the OpenCL-to-CUDA translation
+// header (only if CUDA_API is defined), and the common kernel header. The result is built for
+// 'device' with 'options'. When the build throws, the build log is printed to stdout if the status
+// is a compilation warning or error, and the exception is re-thrown to the caller.
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options) {
+  auto header_string = std::string{""};
+
+  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
+
+  // Adds the name of the routine as a define
+  header_string += "#define ROUTINE_" + routine_name + "\n";
+
+  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+  // which it is known to work with all OpenCL platforms.
+  if (device.IsNVIDIA() || device.IsARM()) {
+    header_string += "#define USE_INLINE_KEYWORD 1\n";
+  }
+
+  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
+  // performance, but might result in a reduced accuracy.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_CL_MAD 1\n";
+  }
+
+  // For specific devices, use staggered/shuffled workgroup indices.
+  if (device.IsAMD() && device.IsGPU()) {
+    header_string += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device.IsARM() && device.IsGPU()) {
+    header_string += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
+  #ifdef CUDA_API
+    header_string +=
+      #include "kernels/opencl_to_cuda.h"
+    ;
+  #endif
+
+  // Loads the common header (typedefs and defines and such)
+  header_string +=
+    #include "kernels/common.opencl"
+  ;
+
+  // Prints details of the routine to compile in case of debugging in verbose mode
+  #ifdef VERBOSE
+    printf("[DEBUG] Compiling routine '%s-%s'\n",
+           routine_name.c_str(), ToString(precision).c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
+  // Compiles the kernel
+  auto program = Program(context, header_string + source_string);
+  try {
+    program.Build(device, options);
+  } catch (const CLCudaAPIBuildError &e) {
+    if (program.StatusIsCompilationWarningOrError(e.status())) {
+      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
+              program.GetBuildInfo(device).c_str());
+    }
+    throw;
+  }
+
+  // Prints the elapsed compilation time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+  #endif
+
+  return program;
+}
+
+// =================================================================================================
+
 // Enqueues a kernel, waits for completion, and checks for errors
 void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                std::vector<size_t> global, const std::vector<size_t> &local,
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index bf3b1762..8a93d74a 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -25,6 +25,12 @@
 namespace clblast {
 // =================================================================================================
 
+// Compiles a program from source code
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+                          const std::string &routine_name,
+                          const Device& device, const Context& context,
+                          std::vector<std::string>& options);
+
 // Enqueues a kernel, waits for completion, and checks for errors
 void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                std::vector<size_t> global, const std::vector<size_t> &local,
-- 
cgit v1.2.3