src/utilities/utilities.hpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides declarations for the common utility functions such as a command-line
// argument parser. On top of this, it serves as the 'common' header, including the C++ OpenCL
// wrapper.
//
// =================================================================================================

#ifndef CLBLAST_UTILITIES_H_
#define CLBLAST_UTILITIES_H_

#include <string>
#include <functional>
#include <complex>
#include <random>
#include <algorithm>
#include <iterator>

#ifdef OPENCL_API
  #include "clpp11.hpp"
  #include "clblast.h"
#elif CUDA_API
  #include "cupp11.hpp"
  #include "clblast_cuda.h"
#endif
#include "clblast_half.h"
#include "utilities/clblast_exceptions.hpp"
#include "utilities/msvc.hpp"

namespace clblast {
// =================================================================================================

// Shorthands for half-precision
using half = unsigned short; // the 'cl_half' OpenCL type is actually an 'unsigned short'

// Shorthands for complex data-types
using float2 = std::complex<float>;
using double2 = std::complex<double>;

// Khronos OpenCL extensions
const std::string kKhronosAttributesAMD = "cl_amd_device_attribute_query";
const std::string kKhronosAttributesNVIDIA = "cl_nv_device_attribute_query";
const std::string kKhronosIntelSubgroups = "cl_intel_subgroups";

// Catched an unknown error
constexpr auto kUnknownError = -999;

// Canary size to add to buffers to check for buffer overflows
constexpr auto kCanarySize = 127;

// =================================================================================================

// The routine-specific arguments in string form
constexpr auto kArgM = "m";
constexpr auto kArgN = "n";
constexpr auto kArgK = "k";
constexpr auto kArgKL = "kl";
constexpr auto kArgKU = "ku";
constexpr auto kArgLayout = "layout";
constexpr auto kArgATransp = "transA";
constexpr auto kArgBTransp = "transB";
constexpr auto kArgSide = "side";
constexpr auto kArgTriangle = "triangle";
constexpr auto kArgDiagonal = "diagonal";
constexpr auto kArgKernelMode = "kernel_mode";
constexpr auto kArgXInc = "incx";
constexpr auto kArgYInc = "incy";
constexpr auto kArgXOffset = "offx";
constexpr auto kArgYOffset = "offy";
constexpr auto kArgALeadDim = "lda";
constexpr auto kArgBLeadDim = "ldb";
constexpr auto kArgCLeadDim = "ldc";
constexpr auto kArgAOffset = "offa";
constexpr auto kArgBOffset = "offb";
constexpr auto kArgCOffset = "offc";
constexpr auto kArgAPOffset = "offap";
constexpr auto kArgDotOffset = "offdot";
constexpr auto kArgNrm2Offset = "offnrm2";
constexpr auto kArgAsumOffset = "offasum";
constexpr auto kArgImaxOffset = "offimax";
constexpr auto kArgAlpha = "alpha";
constexpr auto kArgBeta = "beta";
constexpr auto kArgBatchCount = "batch_num";
constexpr auto kArgNumKernels = "num_kernels";

// Constants for im2col
constexpr auto kArgChannels = "channels";
constexpr auto kArgHeight = "height";
constexpr auto kArgWidth = "width";
constexpr auto kArgKernelH = "kernelh";
constexpr auto kArgKernelW = "kernelw";
constexpr auto kArgPadH = "padh";
constexpr auto kArgPadW = "padw";
constexpr auto kArgStrideH = "strideh";
constexpr auto kArgStrideW = "stridew";
constexpr auto kArgDilationH = "dilationh";
constexpr auto kArgDilationW = "dilationw";

// The tuner-specific arguments in string form
constexpr auto kArgFraction = "fraction";
constexpr auto kArgHeuristicSelection = "heuristic";
constexpr auto kArgMaxL2Norm = "max_l2_norm";
// PSO tuner-specific arguments in string form
constexpr auto kArgPsoSwarmSize = "pso_swarm_size";
constexpr auto kArgPsoInfGlobal = "pso_inf_global";
constexpr auto kArgPsoInfLocal = "pso_inf_local";
constexpr auto kArgPsoInfRandom = "pso_inf_random";
// Annealing tuner-specific arguments in string form
constexpr auto kArgAnnMaxTemp = "ann_max_temperature";

// The common arguments in string form
constexpr auto kArgPlatform = "platform";
constexpr auto kArgDevice = "device";
constexpr auto kArgPrecision = "precision";
constexpr auto kArgHelp = "h";
constexpr auto kArgQuiet = "q";
constexpr auto kArgNoAbbreviations = "no_abbrv";
constexpr auto kArgNumRuns = "runs";
constexpr auto kArgFullStatistics = "full_statistics";

// The buffer names
constexpr auto kBufVecX = "X";
constexpr auto kBufVecY = "Y";
constexpr auto kBufMatA = "A";
constexpr auto kBufMatB = "B";
constexpr auto kBufMatC = "C";
constexpr auto kBufMatAP = "AP";
constexpr auto kBufScalar = "Scalar";
constexpr auto kBufScalarUint = "ScalarUint";

// =================================================================================================

#ifdef VERBOSE
inline void log_debug(const std::string &log_string) {
  printf("[DEBUG] %s\n", log_string.c_str());
}
#else
inline void log_debug(const std::string&) { }
#endif


// =================================================================================================

// Converts a regular or complex type to it's base type (e.g. float2 to float)
template <typename T> struct BaseType { using Type = T; };
template <> struct BaseType<float2> { using Type = float; };
template <> struct BaseType<double2> { using Type = double; };

// =================================================================================================

// Returns a scalar with a default value
template <typename T> T GetScalar();

// Fixed value scalars
template <typename T> T ConstantZero();
template <typename T> T ConstantOne();
template <typename T> T ConstantNegOne();
template <typename T> T Constant(const double val);
template <typename T> T SmallConstant();

// Returns the absolute value of a scalar (modulus in case of complex numbers)
template <typename T> typename BaseType<T>::Type AbsoluteValue(const T value);

// =================================================================================================

// Structure containing all possible arguments for test clients, including their default values
template <typename T>
struct Arguments {
  // Routine-specific arguments
  size_t m = 1;
  size_t n = 1;
  size_t k = 1;
  size_t ku = 1;
  size_t kl = 1;
  Layout layout = Layout::kRowMajor;
  Transpose a_transpose = Transpose::kNo;
  Transpose b_transpose = Transpose::kNo;
  Side side = Side::kLeft;
  Triangle triangle = Triangle::kUpper;
  Diagonal diagonal = Diagonal::kUnit;
  KernelMode kernel_mode = KernelMode::kCrossCorrelation;
  size_t x_inc = 1;
  size_t y_inc = 1;
  size_t x_offset = 0;
  size_t y_offset = 0;
  size_t a_ld = 1;
  size_t b_ld = 1;
  size_t c_ld = 1;
  size_t a_offset = 0;
  size_t b_offset = 0;
  size_t c_offset = 0;
  size_t ap_offset = 0;
  size_t dot_offset = 0;
  size_t nrm2_offset = 0;
  size_t asum_offset = 0;
  size_t imax_offset = 0;
  T alpha = ConstantOne<T>();
  T beta = ConstantOne<T>();
  // Arguments for im2col and convgemm
  size_t channels = 1;
  size_t height = 1;
  size_t width = 1;
  size_t kernel_h = 3;
  size_t kernel_w = 3;
  size_t pad_h = 0;
  size_t pad_w = 0;
  size_t stride_h = 1;
  size_t stride_w = 1;
  size_t dilation_h = 1;
  size_t dilation_w = 1;
  size_t num_kernels = 1;
  // Batch-specific arguments
  size_t batch_count = 1;
  std::vector<size_t> x_offsets; // = {0};
  std::vector<size_t> y_offsets; // = {0};
  std::vector<size_t> a_offsets; // = {0};
  std::vector<size_t> b_offsets; // = {0};
  std::vector<size_t> c_offsets; // = {0};
  std::vector<T> alphas; // = {ConstantOne<T>()};
  std::vector<T> betas; // = {ConstantOne<T>()};
  // Sizes
  size_t x_size = 1;
  size_t y_size = 1;
  size_t a_size = 1;
  size_t b_size = 1;
  size_t c_size = 1;
  size_t ap_size = 1;
  size_t scalar_size = 1;
  // Tuner-specific arguments
  size_t heuristic_selection = 0;
  double fraction = 1.0;
  size_t pso_swarm_size = 8; 
  double pso_inf_global = 0.3;
  double pso_inf_local = 0.6;
  double pso_inf_random = 0.1;
  double ann_max_temperature = 1.0; // Is it a valid default value? 
  // Client-specific arguments
  int compare_clblas = 1;
  int compare_cblas = 1;
  int compare_cublas = 1;
  size_t step = 1;
  size_t num_steps = 0;
  size_t num_runs = 10;
  std::vector<std::string> tuner_files = {};
  bool full_statistics = false;
  #ifdef CLBLAST_REF_CUBLAS
    void* cublas_handle; // cublasHandle_t
  #endif
  // Common arguments
  size_t platform_id = 0;
  size_t device_id = 0;
  Precision precision = Precision::kSingle;
  bool print_help = false;
  bool silent = false;
  bool no_abbrv = false;
};

// =================================================================================================

// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
// data-types such as the Layout and Transpose data-types.
template <typename T>
std::string ToString(T value);

// =================================================================================================

// String splitting by a delimiter
template<typename Out>
void split(const std::string &s, char delimiter, Out result) {
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, delimiter)) {
    *(result++) = item;
  }
}

// See above
inline std::vector<std::string> split(const std::string &s, char delimiter) {
  std::vector<std::string> elements;
  split(s, delimiter, std::back_inserter(elements));
  return elements;
}

// String character removal
inline void remove_character(std::string &str, char to_be_removed) {
  str.erase(std::remove(str.begin(), str.end(), to_be_removed), str.end());
}

// =================================================================================================

// Parses command-line and environmental-variable arguments into a std::vector of strings
std::vector<std::string> RetrieveCommandLineArguments(int argc, char *argv[]);

// Helper for the function "GetArgument"
template <typename T>
T ConvertArgument(const char* value);

// Variant of "ConvertArgument" with default values
template <typename T>
T ConvertArgument(const char* value, T default_value);

// Basic argument parser, matching patterns in the form of "-option value" and "--option value"
template <typename T>
T GetArgument(const std::vector<std::string> &arguments, std::string &help,
              const std::string &option, const T default_value);

// Returns the precision only
Precision GetPrecision(const std::vector<std::string> &arguments,
                       const Precision default_precision = Precision::kSingle);

// As in "GetArgument", but now only checks whether an argument is given or not
bool CheckArgument(const std::vector<std::string> &arguments, std::string &help, const std::string &option);

// =================================================================================================

// Test/example data lower and upper limit
constexpr auto kTestDataLowerLimit = -2.0;
constexpr auto kTestDataUpperLimit = 2.0;

// Populates a vector with random data
template <typename T>
void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist);

// =================================================================================================

// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
// no conversion, but half-precision is not supported as kernel argument so it is converted to float.
template <typename T> struct RealArg { using Type = T; };
template <> struct RealArg<half> { using Type = float; };
template <typename T> typename RealArg<T>::Type GetRealArg(const T value);

// =================================================================================================

// Rounding functions
size_t CeilDiv(const size_t x, const size_t y);
size_t Ceil(const size_t x, const size_t y);

// Returns whether or not 'a' is a multiple of 'b'
bool IsMultiple(const size_t a, const size_t b);

// =================================================================================================

// Convert the precision enum into bytes, e.g. a double takes up 8 bytes
size_t GetBytes(const Precision precision);

// Convert the template argument into a precision value
template <typename T>
Precision PrecisionValue();

// =================================================================================================

// Returns false is this precision is not supported by the device
template <typename T>
bool PrecisionSupported(const Device &device);

// =================================================================================================

// Retrieves the squared difference, used for example for computing the L2 error
template <typename T>
double SquaredDifference(const T val1, const T val2);

// =================================================================================================

// Device information in a specific CLBlast form
std::string GetDeviceType(const Device& device);
std::string GetDeviceVendor(const Device& device);
std::string GetDeviceArchitecture(const Device& device);
std::string GetDeviceName(const Device& device);

// =================================================================================================

void SetOpenCLKernelStandard(const Device &device, std::vector<std::string> &options);

// =================================================================================================

// Solve Bezout's identity
// a * p + b * q = r = GCD(a, b)
void EuclidGCD(int a, int b, int &p, int &q, int &r);

// =================================================================================================
} // namespace clblast

// CLBLAST_UTILITIES_H_
#endif