summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG2
-rw-r--r--src/clpp11.hpp1
-rw-r--r--src/cupp11.hpp1
-rw-r--r--src/routine.cpp2
-rw-r--r--src/tuning/tuning.hpp4
-rw-r--r--src/utilities/compile.cpp12
-rw-r--r--src/utilities/compile.hpp2
-rw-r--r--test/correctness/misc/preprocessor.cpp6
8 files changed, 20 insertions, 10 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 9c0c64cf..ef16cd0d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@
Development (next version)
- Re-designed and integrated the auto-tuner, no more dependency on CLTune
- Made it possible to override the tuning parameters in the clients straight from JSON tuning files
+- Added OpenCL pre-processor to unroll loops and perform array-to-register promotions for compilers
+ which don't do this themselves (ARM, Qualcomm) - greatly improves performance on these platforms
- Added tuned parameters for various devices (see README)
Version 1.2.0
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index a958a4a8..efb1bcfa 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -333,6 +333,7 @@ class Device {
Vendor() == "GenuineIntel" ||
Vendor() == "Intel(R) Corporation"; }
bool IsARM() const { return Vendor() == "ARM"; }
+ bool IsQualcomm() const { return Vendor() == "QUALCOMM"; }
// Platform specific extensions
std::string AMDBoardName() const { // check for 'cl_amd_device_attribute_query' first
diff --git a/src/cupp11.hpp b/src/cupp11.hpp
index 41de0951..eb177ca2 100644
--- a/src/cupp11.hpp
+++ b/src/cupp11.hpp
@@ -321,6 +321,7 @@ public:
bool IsNVIDIA() const { return true; }
bool IsIntel() const { return false; }
bool IsARM() const { return false; }
+ bool IsQualcomm() const { return false; }
// Platform specific extensions
std::string AMDBoardName() const { return ""; }
diff --git a/src/routine.cpp b/src/routine.cpp
index 8c9317d1..5a1c0fe9 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -148,7 +148,7 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
// Completes the source and compiles the kernel
program_ = CompileFromSource(source_string, precision_, routine_name_,
- device_, context_, options, false);
+ device_, context_, options, 0);
// Store the compiled binary and program in the cache
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 329314e5..1d19a17c 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -227,7 +227,7 @@ void Tuner(int argc, char* argv[]) {
// Compiles the kernel
auto compiler_options = std::vector<std::string>();
const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name,
- device, context, compiler_options, false);
+ device, context, compiler_options, 0);
auto kernel = Kernel(program, settings.kernel_name);
C::SetArguments(kernel, args, device_buffers);
printf(" %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str());
@@ -286,7 +286,7 @@ void Tuner(int argc, char* argv[]) {
const auto start_time = std::chrono::steady_clock::now();
auto compiler_options = std::vector<std::string>();
const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name,
- device, context, compiler_options, false, true);
+ device, context, compiler_options, 0, true);
auto kernel = Kernel(program, settings.kernel_name);
const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index 4d1e8929..6243d196 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -25,7 +25,8 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options,
- const bool run_preprocessor, const bool silent) {
+ const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never
+ const bool silent) {
auto header_string = std::string{""};
header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
@@ -75,9 +76,14 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
const auto start_time = std::chrono::steady_clock::now();
#endif
- // Runs a pre-processor to unroll loops and perform array-to-register promotion
+ // Runs a pre-processor to unroll loops and perform array-to-register promotion. Most OpenCL
+ // compilers do this, but some don't.
+ auto do_run_preprocessor = false;
+ if (run_preprocessor == 0) { do_run_preprocessor = (device.IsARM() && device.IsGPU()) ||
+ (device.IsQualcomm() && device.IsGPU()); }
+ if (run_preprocessor == 1) { do_run_preprocessor = true; }
auto kernel_string = header_string + source_string;
- if (run_preprocessor) {
+ if (do_run_preprocessor) {
log_debug("Running built-in pre-processor");
kernel_string = PreprocessKernelSource(kernel_string);
}
diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp
index 0df2ded5..1b4f4a7a 100644
--- a/src/utilities/compile.hpp
+++ b/src/utilities/compile.hpp
@@ -28,7 +28,7 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options,
- const bool run_preprocessor,
+ const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never
const bool silent = false);
// =================================================================================================
diff --git a/test/correctness/misc/preprocessor.cpp b/test/correctness/misc/preprocessor.cpp
index b6a12a38..92ca2490 100644
--- a/test/correctness/misc/preprocessor.cpp
+++ b/test/correctness/misc/preprocessor.cpp
@@ -120,13 +120,13 @@ bool TestKernel(const Device& device, const Context& context,
// Verifies that the current kernel compiles properly (assumes so, otherwise throws an error)
auto compiler_options_ref = std::vector<std::string>();
const auto program_ref = CompileFromSource(kernel_source, precision, kernel_name,
- device, context, compiler_options_ref, false);
+ device, context, compiler_options_ref, 2);
// Compiles the same kernel, but now with the pre-processor enabled
try {
auto compiler_options = std::vector<std::string>();
const auto program = CompileFromSource(kernel_source, precision, kernel_name,
- device, context, compiler_options, true);
+ device, context, compiler_options, 1);
return true;
} catch (const CLCudaAPIBuildError &e) {
fprintf(stdout, "* ERROR: Compilation warnings/errors with pre-processed kernel, status %d\n",
@@ -219,7 +219,7 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio
#include "../src/kernels/level3/level3.opencl"
#include "../src/kernels/level3/transpose_pad.opencl"
;
- //if (TestKernel(device, context, "TransposePadMatrix", transpose_pad_sources, precision)) { passed++; } else { errors++; }
+ if (TestKernel(device, context, "TransposePadMatrix", transpose_pad_sources, precision)) { passed++; } else { errors++; }
// GEMM (in-direct)
const auto gemm_sources =