From 0157d6d4ea50a789457637f98a373d31b62c1496 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Thu, 16 Jul 2015 22:42:02 +0200 Subject: Using mad() instruction for AMD devices like clBLAS does --- src/kernels/common.opencl | 7 +++++-- src/routine.cc | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 0d29c7a6..2e1d8f90 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -82,8 +82,11 @@ R"( // ================================================================================================= -// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction -#define USE_CL_MAD 0 +// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction by default. For specific +// devices, this is enabled (see src/routine.cc). +#ifndef USE_CL_MAD + #define USE_CL_MAD 0 +#endif // Sets a variable to zero #if PRECISION == 3232 || PRECISION == 6464 diff --git a/src/routine.cc b/src/routine.cc index 339027d4..27bfa8f9 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -68,6 +68,14 @@ StatusCode Routine::SetUp(const std::string &routine_source) { // Collects the parameters for this device in the form of defines, and adds the precision auto defines = db_.GetDefines(); defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n"; + + // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve + // performance, but might result in reduced accuracy. + if (device_.Vendor() == "AMD") { + defines += "#define USE_CL_MAD 1\n"; + } + + // Combines everything together into a single source string auto source_string = defines + common_header + routine_source; // Compiles the kernel -- cgit v1.2.3