diff options
-rw-r--r-- | CHANGELOG | 2 | ||||
-rw-r--r-- | src/clpp11.hpp | 1 | ||||
-rw-r--r-- | src/cupp11.hpp | 1 | ||||
-rw-r--r-- | src/routine.cpp | 2 | ||||
-rw-r--r-- | src/tuning/tuning.hpp | 4 | ||||
-rw-r--r-- | src/utilities/compile.cpp | 12 | ||||
-rw-r--r-- | src/utilities/compile.hpp | 2 | ||||
-rw-r--r-- | test/correctness/misc/preprocessor.cpp | 6 |
8 files changed, 20 insertions, 10 deletions
@@ -2,6 +2,8 @@ Development (next version) - Re-designed and integrated the auto-tuner, no more dependency on CLTune - Made it possible to override the tuning parameters in the clients straight from JSON tuning files +- Added OpenCL pre-processor to unroll loops and perform array-to-register promotions for compilers + which don't do this themselves (ARM, Qualcomm) - greatly improves performance on these platforms - Added tuned parameters for various devices (see README) Version 1.2.0 diff --git a/src/clpp11.hpp b/src/clpp11.hpp index a958a4a8..efb1bcfa 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -333,6 +333,7 @@ class Device { Vendor() == "GenuineIntel" || Vendor() == "Intel(R) Corporation"; } bool IsARM() const { return Vendor() == "ARM"; } + bool IsQualcomm() const { return Vendor() == "QUALCOMM"; } // Platform specific extensions std::string AMDBoardName() const { // check for 'cl_amd_device_attribute_query' first diff --git a/src/cupp11.hpp b/src/cupp11.hpp index 41de0951..eb177ca2 100644 --- a/src/cupp11.hpp +++ b/src/cupp11.hpp @@ -321,6 +321,7 @@ public: bool IsNVIDIA() const { return true; } bool IsIntel() const { return false; } bool IsARM() const { return false; } + bool IsQualcomm() const { return false; } // Platform specific extensions std::string AMDBoardName() const { return ""; } diff --git a/src/routine.cpp b/src/routine.cpp index 8c9317d1..5a1c0fe9 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -148,7 +148,7 @@ void Routine::InitProgram(std::initializer_list<const char *> source) { // Completes the source and compiles the kernel program_ = CompileFromSource(source_string, precision_, routine_name_, - device_, context_, options, false); + device_, context_, options, 0); // Store the compiled binary and program in the cache diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index 329314e5..1d19a17c 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -227,7 +227,7 @@ void Tuner(int argc, char* argv[]) { // Compiles 
the kernel auto compiler_options = std::vector<std::string>(); const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name, - device, context, compiler_options, false); + device, context, compiler_options, 0); auto kernel = Kernel(program, settings.kernel_name); C::SetArguments(kernel, args, device_buffers); printf(" %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str()); @@ -286,7 +286,7 @@ void Tuner(int argc, char* argv[]) { const auto start_time = std::chrono::steady_clock::now(); auto compiler_options = std::vector<std::string>(); const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name, - device, context, compiler_options, false, true); + device, context, compiler_options, 0, true); auto kernel = Kernel(program, settings.kernel_name); const auto elapsed_time = std::chrono::steady_clock::now() - start_time; const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count(); diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp index 4d1e8929..6243d196 100644 --- a/src/utilities/compile.cpp +++ b/src/utilities/compile.cpp @@ -25,7 +25,8 @@ Program CompileFromSource(const std::string &source_string, const Precision prec const std::string &routine_name, const Device& device, const Context& context, std::vector<std::string>& options, - const bool run_preprocessor, const bool silent) { + const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never + const bool silent) { auto header_string = std::string{""}; header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n"; @@ -75,9 +76,14 @@ Program CompileFromSource(const std::string &source_string, const Precision prec const auto start_time = std::chrono::steady_clock::now(); #endif - // Runs a pre-processor to unroll loops and perform array-to-register promotion + // Runs a pre-processor to unroll loops and perform array-to-register promotion. 
Most OpenCL + // compilers do this, but some don't. + auto do_run_preprocessor = false; + if (run_preprocessor == 0) { do_run_preprocessor = (device.IsARM() && device.IsGPU()) || + (device.IsQualcomm() && device.IsGPU()); } + if (run_preprocessor == 1) { do_run_preprocessor = true; } auto kernel_string = header_string + source_string; - if (run_preprocessor) { + if (do_run_preprocessor) { log_debug("Running built-in pre-processor"); kernel_string = PreprocessKernelSource(kernel_string); } diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp index 0df2ded5..1b4f4a7a 100644 --- a/src/utilities/compile.hpp +++ b/src/utilities/compile.hpp @@ -28,7 +28,7 @@ Program CompileFromSource(const std::string &source_string, const Precision prec const std::string &routine_name, const Device& device, const Context& context, std::vector<std::string>& options, - const bool run_preprocessor, + const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never const bool silent = false); // ================================================================================================= diff --git a/test/correctness/misc/preprocessor.cpp b/test/correctness/misc/preprocessor.cpp index b6a12a38..92ca2490 100644 --- a/test/correctness/misc/preprocessor.cpp +++ b/test/correctness/misc/preprocessor.cpp @@ -120,13 +120,13 @@ bool TestKernel(const Device& device, const Context& context, // Verifies that the current kernel compiles properly (assumes so, otherwise throws an error) auto compiler_options_ref = std::vector<std::string>(); const auto program_ref = CompileFromSource(kernel_source, precision, kernel_name, - device, context, compiler_options_ref, false); + device, context, compiler_options_ref, 2); // Compiles the same kernel, but now with the pre-processor enabled try { auto compiler_options = std::vector<std::string>(); const auto program = CompileFromSource(kernel_source, precision, kernel_name, - device, context, compiler_options, true); + device, 
context, compiler_options, 1); return true; } catch (const CLCudaAPIBuildError &e) { fprintf(stdout, "* ERROR: Compilation warnings/errors with pre-processed kernel, status %d\n", @@ -219,7 +219,7 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/transpose_pad.opencl" ; - //if (TestKernel(device, context, "TransposePadMatrix", transpose_pad_sources, precision)) { passed++; } else { errors++; } + if (TestKernel(device, context, "TransposePadMatrix", transpose_pad_sources, precision)) { passed++; } else { errors++; } // GEMM (in-direct) const auto gemm_sources = |