// src/utilities/compile.cpp

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the kernel compilation functions (see the header for more information).
//
// =================================================================================================

#include <vector>
#include <chrono>

#include "routines/common.hpp"
#include "kernel_preprocessor.hpp"

namespace clblast {
// =================================================================================================

// Assembles the complete kernel source for a routine and builds it into a program.
//
// The source handed to the compiler is, in this order: a generated list of '#define' lines
// (precision, routine name and device-dependent switches), the optional OpenCL-to-CUDA translation
// header (CUDA_API builds only), the common kernel header, and finally 'source_string'.
//
//   source_string     routine-specific kernel code, appended after the generated headers
//   precision         emitted as the integer value of the PRECISION define; also gates the
//                     subgroup-shuffling defines
//   routine_name      emitted as a 'ROUTINE_<name>' define
//   device, context   device to build for and context the program is created in
//   options           compiler options; handed to SetOpenCLKernelStandard() and then to Build()
//                     NOTE(review): taken by non-const reference, so it is presumably extended by
//                     SetOpenCLKernelStandard() - confirm against that helper
//   run_preprocessor  0: only for ARM GPUs, 1: always, any other value: never
//   silent            suppresses printing of the build log when the build fails
//
// Returns the built program. A CLCudaAPIBuildError raised while building is re-thrown after the
// build log has (optionally) been printed.
std::shared_ptr<Program> CompileFromSource(
                          const std::string &source_string, const Precision precision,
                          const std::string &routine_name,
                          const Device& device, const Context& context,
                          std::vector<std::string>& options,
                          const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never
                          const bool silent) {

  // Vendor and device-type properties steering the defines below, queried once up-front
  const bool is_gpu = device.IsGPU();
  const bool is_amd = device.IsAMD();
  const bool is_arm = device.IsARM();
  const bool is_nvidia = device.IsNVIDIA();
  const bool is_qualcomm = device.IsQualcomm();
  const bool is_amd_gpu = is_amd && is_gpu;
  const bool is_arm_gpu = is_arm && is_gpu;
  const bool is_qualcomm_gpu = is_qualcomm && is_gpu;
  const bool is_single = (precision == Precision::kSingle);
  const bool is_single_or_half = is_single || (precision == Precision::kHalf);

  // The generated header starts with the precision and the routine name
  std::string preamble;
  preamble += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
  preamble += "#define ROUTINE_" + routine_name + "\n";

  // The 'inline' keyword is not accepted by every OpenCL compiler: it is enabled only for vendors
  // on which it is known to work across all OpenCL platforms.
  if (is_nvidia || is_arm || is_qualcomm) { preamble += "#define USE_INLINE_KEYWORD 1\n"; }

  // The OpenCL mad() instruction is not IEEE-754 compliant: it can be faster at the cost of some
  // accuracy, so it is enabled for a selection of devices only.
  if (is_amd_gpu || is_qualcomm_gpu) { preamble += "#define USE_CL_MAD 1\n"; }

  // Staggered/shuffled workgroup indices for specific devices
  if (is_amd_gpu) { preamble += "#define USE_STAGGERED_INDICES 1\n"; }

  // A global synchronisation barrier in the GEMM kernel improves cache behaviour (and thereby
  // performance) on specific devices
  if (is_arm_gpu || is_qualcomm_gpu) { preamble += "#define GLOBAL_MEM_FENCE 1\n"; }

  // Subgroup shuffling through the Intel subgroups extension. The extension is only looked up for
  // GPUs, and before the precision is taken into account.
  if (is_gpu && device.HasExtension(kKhronosIntelSubgroups) && is_single_or_half) {
    preamble += "#define USE_SUBGROUP_SHUFFLING 1\n";
    preamble += "#define SUBGROUP_SHUFFLING_INTEL 1\n";
  }

  // Subgroup shuffling through inline PTX on NVIDIA GPUs. The shuffle commands changed with Volta,
  // so the architecture generation is checked here, and only for NVIDIA GPUs.
  if (is_gpu && is_nvidia && is_single) {
    preamble += "#define USE_SUBGROUP_SHUFFLING 1\n";
    preamble += device.IsPostNVIDIAVolta() ? "#define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 1\n"
                                           : "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
  }

  // On Qualcomm devices the reqd_work_group_size kernel attribute reduces performance. Compiling
  // without the workgroup size requirement does not affect correctness.
  if (is_qualcomm) { preamble += "#define RELAX_WORKGROUP_SIZE 1\n"; }

  // CUDA builds only: the header translating OpenCL kernels to CUDA kernels
  #ifdef CUDA_API
    preamble +=
      #include "kernels/opencl_to_cuda.h"
    ;
  #endif

  // The common header with typedefs, defines and such
  preamble +=
    #include "kernels/common.opencl"
  ;

  // Verbose mode: reports which routine is being compiled and starts the compilation timer
  #ifdef VERBOSE
    printf("[DEBUG] Compiling routine '%s-%s'\n",
           routine_name.c_str(), ToString(precision).c_str());
    const auto compile_begin = std::chrono::steady_clock::now();
  #endif

  // The built-in pre-processor unrolls loops and performs array-to-register promotion, which most
  // (but not all) OpenCL compilers do themselves. Mode 1 forces it, mode 0 leaves the decision to
  // the platform (ARM GPUs only), any other mode disables it.
  const bool apply_preprocessor =
      (run_preprocessor == 1) || (run_preprocessor == 0 && is_arm_gpu);
  auto kernel_source = preamble + source_string;
  if (apply_preprocessor) {
    log_debug("Running built-in pre-processor");
    kernel_source = PreprocessKernelSource(kernel_source);
  }

  // Creates and builds the program. On a build failure the compiler output is printed (unless
  // silenced) before the exception continues to the caller.
  auto program = std::make_shared<Program>(context, kernel_source);
  try {
    SetOpenCLKernelStandard(device, options);
    program->Build(device, options);
  } catch (const CLCudaAPIBuildError &build_error) {
    const bool is_compiler_diagnostic =
        program->StatusIsCompilationWarningOrError(build_error.status());
    if (is_compiler_diagnostic && !silent) {
      const auto build_log = program->GetBuildInfo(device);
      fprintf(stdout, "OpenCL compiler error/warning:\n%s\n", build_log.c_str());
    }
    throw;
  }

  // Verbose mode: reports how long the steps since the start of the timer took
  #ifdef VERBOSE
    const std::chrono::duration<double,std::milli> elapsed_ms =
        std::chrono::steady_clock::now() - compile_begin;
    printf("[DEBUG] Completed compilation in %.2lf ms\n", elapsed_ms.count());
  #endif

  return program;
}

// =================================================================================================
} // namespace clblast