summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt37
-rw-r--r--src/routines/common.cpp78
-rw-r--r--src/routines/common.hpp7
-rw-r--r--src/tuning/tuning.hpp1
-rw-r--r--src/utilities/compile.cpp99
-rw-r--r--src/utilities/compile.hpp36
-rw-r--r--src/utilities/timing.hpp1
7 files changed, 163 insertions, 96 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ba512eb..f051e441 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -224,6 +224,7 @@ endif()
set(SOURCES
src/database/database.cpp
src/routines/common.cpp
+ src/utilities/compile.cpp
src/utilities/clblast_exceptions.cpp
src/utilities/timing.cpp
src/utilities/utilities.cpp
@@ -244,6 +245,7 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual
src/routines/common.hpp
src/routines/routines.hpp
src/utilities/buffer_test.hpp
+ src/utilities/compile.hpp
src/utilities/clblast_exceptions.hpp
src/utilities/device_mapping.hpp
src/utilities/msvc.hpp
@@ -366,29 +368,42 @@ endif()
# ==================================================================================================
# This section contains all the code related to the tuners
-# TODO: Remove dependency on CLBlast
if(TUNERS)
set(TUNERS_COMMON
+ src/utilities/compile.cpp
+ src/utilities/clblast_exceptions.cpp
+ src/utilities/timing.cpp
+ src/utilities/utilities.cpp
src/tuning/configurations.cpp
+ src/tuning/tuning.cpp)
+ set(TUNERS_HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio
+ src/utilities/compile.hpp
+ src/utilities/clblast_exceptions.hpp
+ src/utilities/timing.hpp
+ src/utilities/utilities.hpp
src/tuning/configurations.hpp
- src/tuning/tuning.cpp
src/tuning/tuning.hpp)
- # Visual Studio requires the sources of non-exported objects/libraries
- if(MSVC)
- set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
- endif()
-
# Adds tuning executables
foreach(KERNEL ${KERNELS})
- add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
- target_link_libraries(clblast_tuner_${KERNEL} clblast ${API_LIBRARIES})
+ add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} ${TUNERS_HEADERS}
+ src/tuning/kernels/${KERNEL}.cpp)
+ target_include_directories(clblast_tuner_${KERNEL} PUBLIC
+ $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
+ $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
+ ${API_INCLUDE_DIRS})
+ target_link_libraries(clblast_tuner_${KERNEL} ${API_LIBRARIES})
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
endforeach()
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
- add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
- target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${API_LIBRARIES})
+ add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} ${TUNERS_HEADERS}
+ src/tuning/routines/${ROUTINE_TUNER}.cpp)
+ target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC
+ $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
+ $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
+ ${API_INCLUDE_DIRS})
+ target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} ${API_LIBRARIES})
install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
endforeach()
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index c415d9fd..5b178e53 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -19,84 +19,6 @@
namespace clblast {
// =================================================================================================
-// Compiles a program from source code
-Program CompileFromSource(const std::string &source_string, const Precision precision,
- const std::string &routine_name,
- const Device& device, const Context& context,
- std::vector<std::string>& options) {
- auto header_string = std::string{""};
-
- header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
-
- // Adds the name of the routine as a define
- header_string += "#define ROUTINE_" + routine_name + "\n";
-
- // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
- // which it is known to work with all OpenCL platforms.
- if (device.IsNVIDIA() || device.IsARM()) {
- header_string += "#define USE_INLINE_KEYWORD 1\n";
- }
-
- // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
- // performance, but might result in a reduced accuracy.
- if (device.IsAMD() && device.IsGPU()) {
- header_string += "#define USE_CL_MAD 1\n";
- }
-
- // For specific devices, use staggered/shuffled workgroup indices.
- if (device.IsAMD() && device.IsGPU()) {
- header_string += "#define USE_STAGGERED_INDICES 1\n";
- }
-
- // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
- // performance through better cache behaviour
- if (device.IsARM() && device.IsGPU()) {
- header_string += "#define GLOBAL_MEM_FENCE 1\n";
- }
-
- // Optionally adds a translation header from OpenCL kernels to CUDA kernels
- #ifdef CUDA_API
- source_string +=
- #include "kernels/opencl_to_cuda.h"
- ;
- #endif
-
- // Loads the common header (typedefs and defines and such)
- header_string +=
- #include "kernels/common.opencl"
- ;
-
- // Prints details of the routine to compile in case of debugging in verbose mode
- #ifdef VERBOSE
- printf("[DEBUG] Compiling routine '%s-%s'\n",
- routine_name.c_str(), ToString(precision).c_str());
- const auto start_time = std::chrono::steady_clock::now();
- #endif
-
- // Compiles the kernel
- auto program = Program(context, header_string + source_string);
- try {
- program.Build(device, options);
- } catch (const CLCudaAPIBuildError &e) {
- if (program.StatusIsCompilationWarningOrError(e.status())) {
- fprintf(stdout, "OpenCL compiler error/warning: %s\n",
- program.GetBuildInfo(device).c_str());
- }
- throw;
- }
-
- // Prints the elapsed compilation time in case of debugging in verbose mode
- #ifdef VERBOSE
- const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
- const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
- printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
- #endif
-
- return program;
-}
-
-// =================================================================================================
-
// Enqueues a kernel, waits for completion, and checks for errors
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 8a93d74a..06d001d9 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -20,17 +20,12 @@
#include <vector>
#include "utilities/utilities.hpp"
+#include "utilities/compile.hpp"
#include "database/database.hpp"
namespace clblast {
// =================================================================================================
-// Compiles a program from source code
-Program CompileFromSource(const std::string &source_string, const Precision precision,
- const std::string &routine_name,
- const Device& device, const Context& context,
- std::vector<std::string>& options);
-
// Enqueues a kernel, waits for completion, and checks for errors
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 83f08ea9..c8a12b5b 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -23,6 +23,7 @@
#include <iostream>
#include "utilities/utilities.hpp"
+#include "utilities/compile.hpp"
#include "utilities/timing.hpp"
#include "tuning/configurations.hpp"
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
new file mode 100644
index 00000000..3c02d316
--- /dev/null
+++ b/src/utilities/compile.cpp
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the kernel compilation functions (see the header for more information).
+//
+// =================================================================================================
+
+#include <vector>
+#include <chrono>
+
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program for 'device' from 'source_string', prefixed by a generated header made of
+// precision/routine/device-specific defines and the common kernel header. If the build fails with
+// a compilation warning/error status, the build log is printed before the exception is re-thrown.
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+ const std::string &routine_name,
+ const Device& device, const Context& context,
+ std::vector<std::string>& options) {
+ auto header_string = std::string{""};
+
+ header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
+
+ // Adds the name of the routine as a define
+ header_string += "#define ROUTINE_" + routine_name + "\n";
+
+ // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
+ // which it is known to work with all OpenCL platforms.
+ if (device.IsNVIDIA() || device.IsARM()) {
+ header_string += "#define USE_INLINE_KEYWORD 1\n";
+ }
+
+ // For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction (this can
+ // improve performance, but might result in a reduced accuracy) and use staggered/shuffled
+ // workgroup indices.
+ if (device.IsAMD() && device.IsGPU()) {
+ header_string += "#define USE_CL_MAD 1\n";
+ header_string += "#define USE_STAGGERED_INDICES 1\n";
+ }
+
+ // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+ // performance through better cache behaviour
+ if (device.IsARM() && device.IsGPU()) {
+ header_string += "#define GLOBAL_MEM_FENCE 1\n";
+ }
+
+ // Optionally adds a translation header from OpenCL kernels to CUDA kernels. It is appended to
+ // the header because 'source_string' is a const reference and cannot be modified here.
+ #ifdef CUDA_API
+ header_string +=
+ #include "kernels/opencl_to_cuda.h"
+ ;
+ #endif
+
+ // Loads the common header (typedefs and defines and such)
+ header_string +=
+ #include "kernels/common.opencl"
+ ;
+
+ // Prints details of the routine to compile in case of debugging in verbose mode
+ #ifdef VERBOSE
+ printf("[DEBUG] Compiling routine '%s-%s'\n",
+ routine_name.c_str(), ToString(precision).c_str());
+ const auto start_time = std::chrono::steady_clock::now();
+ #endif
+
+ // Compiles the kernel: the generated header is placed in front of the kernel source
+ auto program = Program(context, header_string + source_string);
+ try {
+ program.Build(device, options);
+ } catch (const CLCudaAPIBuildError &e) {
+ if (program.StatusIsCompilationWarningOrError(e.status())) {
+ fprintf(stdout, "OpenCL compiler error/warning: %s\n",
+ program.GetBuildInfo(device).c_str());
+ }
+ throw;
+ }
+
+ // Prints the elapsed compilation time in case of debugging in verbose mode
+ #ifdef VERBOSE
+ const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+ const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+ printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+ #endif
+
+ return program;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp
new file mode 100644
index 00000000..bd4686eb
--- /dev/null
+++ b/src/utilities/compile.hpp
@@ -0,0 +1,36 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the CLBlast way to compile a kernel from source, used for the library and for
+// the auto-tuners.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_UTILITIES_COMPILE_H_
+#define CLBLAST_UTILITIES_COMPILE_H_
+
+#include <string>
+#include <vector>
+
+#include "utilities/utilities.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Compiles a program from source code
+Program CompileFromSource(const std::string &source_string, const Precision precision,
+ const std::string &routine_name,
+ const Device& device, const Context& context,
+ std::vector<std::string>& options);
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_UTILITIES_COMPILE_H_
+#endif
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index 3a5e2cff..e8040058 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -21,7 +21,6 @@
#include <chrono>
#include "utilities/utilities.hpp"
-#include "routines/common.hpp"
namespace clblast {
// =================================================================================================