// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the common code to perform (partial) matrix inversion. This code is based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra.
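// The inverted diagonal blocks computed by this routine allow the triangular solves in TRSM to be
// replaced by matrix-matrix multiplications.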
//
// =================================================================================================

#include "routines/levelx/xinvert.hpp"

#include <string>
#include <vector>
#include <cassert>

namespace clblast {
// =================================================================================================

// Constructor: forwards to base class constructor
template <typename T>
Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
      #include "../../kernels/level3/level3.opencl"
      , // separated in multiple parts to prevent C1091 in MSVC 2013
      #include "../../kernels/level3/invert_diagonal_blocks_part1.opencl"
      , // separated in multiple parts to prevent C1091 in MSVC 2013
      #include "../../kernels/level3/invert_diagonal_blocks_part2.opencl"
    }) {
}

// =================================================================================================
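
// As an illustrative sketch only (the actual call-site lives in the TRSM routine, not in this
// file), a caller could use it roughly as follows. The variable names here are placeholders;
// 'block_size' must be a multiple of 16 (at most 128) and 'temp_buffer' must hold at least
// block_size x (num_blocks * block_size) elements:
//
//   auto inverter = Xinvert<T>(queue, event, "INVERT");
//   inverter.InvertMatrixDiagonalBlocks(Layout::kColMajor, Triangle::kLower, Diagonal::kNonUnit,
//                                       n, block_size, a_buffer, a_offset, a_ld, temp_buffer);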

// Inverts diagonal square blocks of a matrix
template <typename T>
void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle,
                                            const Diagonal diag, const size_t n,
                                            const size_t block_size, const Buffer<T> &src,
                                            const size_t offset, const size_t ld_src,
                                            Buffer<T> &dest) {

  // Makes sure all dimensions are larger than zero
  if ((block_size == 0) || (n == 0)) {
    throw BLASError(StatusCode::kInvalidDimension);
  }

  // Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
  if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
    throw RuntimeErrorCode(StatusCode::kNotImplemented);
  }

  // Helper variables
  const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
  assert(internal_block_size == 16);
  const auto num_blocks = CeilDiv(n, block_size);
  const auto num_internal_blocks = CeilDiv(n, internal_block_size);
  const auto unit_diagonal = (diag == Diagonal::kUnit);

  // This routine only supports block sizes which are a multiple of the internal block size and
  // block sizes up to and including 128
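  // (with the INTERNAL_BLOCK_SIZE of 16 this means supported block sizes of 16, 32, 64, or 128)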
  if ((block_size % internal_block_size != 0) || (block_size > 128)) {
    throw BLASError(StatusCode::kUnknownError);
  }

  // Checks for validity of the source and destination matrices
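  // (the destination stores the inverted diagonal blocks back-to-back as a block_size by
  //  num_blocks * block_size matrix with a leading dimension of block_size)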
  TestMatrixA(n, n, src, offset, ld_src);
  TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size);

  // Determines which kernels to run based on the layout (the kernels assume column-major as
  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
  const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                         (triangle == Triangle::kLower && layout == Layout::kRowMajor));
  const auto name_postfix = (is_upper) ? "Upper" : "Lower";

  // Fills the output buffer with zeros
  auto event_wait_list = std::vector<Event>();
  auto fill_matrix_event = Event();
  FillMatrix(queue_, device_, program_, fill_matrix_event.pointer(), event_wait_list,
             block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>(),
             16);
  event_wait_list.push_back(fill_matrix_event);

  // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
  auto kernel = Kernel(program_, "InvertDiagonalBlock");
  kernel.SetArgument(0, static_cast<int>(n));
  kernel.SetArgument(1, src());
  kernel.SetArgument(2, static_cast<int>(offset));
  kernel.SetArgument(3, static_cast<int>(ld_src));
  kernel.SetArgument(4, dest());
  kernel.SetArgument(5, static_cast<int>(block_size));
  kernel.SetArgument(6, static_cast<int>(unit_diagonal));
  kernel.SetArgument(7, static_cast<int>(is_upper));
  const auto local = std::vector<size_t>{internal_block_size};
  const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
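  // If the requested block size equals the internal block size, no further build-up is needed:
  // the kernel then directly signals the routine's completion event. Otherwise its own event is
  // recorded and added to the wait list of the triple-matmul kernels below.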
  auto base_kernel_event = Event();
  const auto is_only_kernel = (internal_block_size == block_size);
  auto base_kernel_event_pointer = (is_only_kernel) ? event_ : base_kernel_event.pointer();
  RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
  if (!is_only_kernel) { event_wait_list.push_back(base_kernel_event); }

  // Builds up block_size x block_size blocks. For example, internal_block_size=16:
  // use   16 x 16  blocks to build  32 x 32  blocks,  1 x (1 x npages) grid,  4 x 4 threads;
  // then  32 x 32  blocks to build  64 x 64  blocks,  1 x (2 x npages) grid,  8 x 4 threads;
  // then  64 x 64  blocks to build 128 x 128 blocks,  1 x (4 x npages) grid, 16 x 4 threads;
  for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) {
    assert(current_size == 16 || current_size == 32 || current_size == 64);

    // Emulates a 3D grid: NX * (NY * npages)
    const auto npages = CeilDiv(n, current_size*2);
    const auto local0 = (current_size <= 32) ? current_size/4 : 16;
    const auto local = std::vector<size_t>{local0, 4};
    const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
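    // (the second global dimension packs the NY and npages dimensions of the grid together)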

    // Part 1
    auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
    kernel1.SetArgument(0, static_cast<int>(n));
    kernel1.SetArgument(1, src());
    kernel1.SetArgument(2, static_cast<int>(offset));
    kernel1.SetArgument(3, static_cast<int>(ld_src));
    kernel1.SetArgument(4, dest());
    kernel1.SetArgument(5, static_cast<int>(current_size));
    kernel1.SetArgument(6, static_cast<int>(npages));
    kernel1.SetArgument(7, static_cast<int>(block_size));
    auto kernel1_event = Event();
    RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list);
    event_wait_list.push_back(kernel1_event);

    // Part 2
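    // (the final Part2 kernel of the last doubling step signals the routine's completion event;
    //  intermediate kernels instead record their own event for the wait list)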
    const bool is_last_kernel = (current_size * 2 >= block_size);
    auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix);
    kernel2.SetArgument(0, static_cast<int>(n));
    kernel2.SetArgument(1, dest());
    kernel2.SetArgument(2, static_cast<int>(current_size));
    kernel2.SetArgument(3, static_cast<int>(npages));
    kernel2.SetArgument(4, static_cast<int>(block_size));
    auto kernel2_event = Event();
    auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
    RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list);
    if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }

    // Exit in case we reach beyond the bounds of the input matrix
    if (current_size*2 >= n) { break; }
  }
}

// =================================================================================================

// Compiles the templated class
template class Xinvert<half>;
template class Xinvert<float>;
template class Xinvert<double>;
template class Xinvert<float2>;
template class Xinvert<double2>;

// =================================================================================================
} // namespace clblast