1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the kernel compilation functions (see the header for more information).
//
// =================================================================================================
#include <vector>
#include <chrono>
#include "routines/common.hpp"
#include "kernel_preprocessor.hpp"
namespace clblast {
// =================================================================================================
// Compiles a program from source code
std::shared_ptr<Program> CompileFromSource(
const std::string &source_string, const Precision precision,
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options,
const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never
const bool silent) {
auto header_string = std::string{""};
header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
// Adds the name of the routine as a define
header_string += "#define ROUTINE_" + routine_name + "\n";
// Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
// which it is known to work with all OpenCL platforms.
if (device.IsNVIDIA() || device.IsARM()) {
header_string += "#define USE_INLINE_KEYWORD 1\n";
}
// For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device.IsAMD() && device.IsGPU()) {
header_string += "#define USE_CL_MAD 1\n";
}
// For specific devices, use staggered/shuffled workgroup indices.
if (device.IsAMD() && device.IsGPU()) {
header_string += "#define USE_STAGGERED_INDICES 1\n";
}
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
// performance through better cache behaviour
if (device.IsARM() && device.IsGPU()) {
header_string += "#define GLOBAL_MEM_FENCE 1\n";
}
// For Intel GPUs with subgroup support, use subgroup shuffling.
if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
}
// Optionally adds a translation header from OpenCL kernels to CUDA kernels
#ifdef CUDA_API
header_string +=
#include "kernels/opencl_to_cuda.h"
;
#endif
// Loads the common header (typedefs and defines and such)
header_string +=
#include "kernels/common.opencl"
;
// Prints details of the routine to compile in case of debugging in verbose mode
#ifdef VERBOSE
printf("[DEBUG] Compiling routine '%s-%s'\n",
routine_name.c_str(), ToString(precision).c_str());
const auto start_time = std::chrono::steady_clock::now();
#endif
// Runs a pre-processor to unroll loops and perform array-to-register promotion. Most OpenCL
// compilers do this, but some don't.
auto do_run_preprocessor = false;
if (run_preprocessor == 0) { do_run_preprocessor = (device.IsARM() && device.IsGPU()); }
if (run_preprocessor == 1) { do_run_preprocessor = true; }
auto kernel_string = header_string + source_string;
if (do_run_preprocessor) {
log_debug("Running built-in pre-processor");
kernel_string = PreprocessKernelSource(kernel_string);
}
// Compiles the kernel
auto program = std::make_shared<Program>(context, kernel_string);
try {
program->Build(device, options);
} catch (const CLCudaAPIBuildError &e) {
if (program->StatusIsCompilationWarningOrError(e.status()) && !silent) {
fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
program->GetBuildInfo(device).c_str());
}
throw;
}
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
#endif
return program;
}
// =================================================================================================
} // namespace clblast
|