src/kernels/level3/invert_diagonal_blocks_part2.opencl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This is part 2 of 2, see part 1 of the invert kernel for a description
//
// =================================================================================================

// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(

// =================================================================================================
#if defined(ROUTINE_INVERT)

// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}

// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size);
}

// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}

// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size);
}

// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}

// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
}

// =================================================================================================

// B12 =  A12 * B22
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}

// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size);
}

// B12 =  A12 * B22
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}

// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size);
}

// B12 =  A12 * B22
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}

// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
  __local real lm[LOCALY * LOCALX];
  TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
}

#endif
// =================================================================================================

// End of the C++11 raw string literal
)"

// =================================================================================================