author     Cedric Nugteren <web@cedricnugteren.nl>   2017-01-15 17:30:00 +0100
committer  Cedric Nugteren <web@cedricnugteren.nl>   2017-01-15 17:30:00 +0100
commit     4b3ffd998904f5c848edc5917308f5942fa71da3 (patch)
tree       f264f9e0fe241fdbe25b623c4208da222a99d973 /src/kernels
parent     4a4be0c3a5e603a9c0c4c7aebcb660b5e62b99ad (diff)

Added a first version of the diagonal block invert routine in preparation of TRSM
Diffstat (limited to 'src/kernels')
 -rw-r--r--  src/kernels/common.opencl                         |  14
 -rw-r--r--  src/kernels/level3/invert_diagonal_blocks.opencl  | 440
 2 files changed, 454 insertions(+), 0 deletions(-)
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index b0817242..bfa1042d 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -162,6 +162,13 @@ R"(
#define AbsoluteValue(value) value = fabs(value)
#endif
+// Negation (component-wise)
+#if PRECISION == 3232 || PRECISION == 6464
+ #define Negate(value) value.x = -(value.x); value.y = -(value.y)
+#else
+ #define Negate(value) value = -(value)
+#endif
+
// Adds two complex variables
#if PRECISION == 3232 || PRECISION == 6464
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y
@@ -193,6 +200,13 @@ R"(
#endif
#endif
+// The scalar division function (divides by the real component of b)
+#if PRECISION == 3232 || PRECISION == 6464
+  #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.y / b.x
+#else
+ #define DivideReal(c, a, b) c = a / b
+#endif
+
// The scalar AXPBY function
#if PRECISION == 3232 || PRECISION == 6464
#define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)
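
For the complex precisions (PRECISION == 3232 and 6464) a value is stored as a two-component vector, so the new macros spell out the .x and .y components. A minimal host-side C sketch of what the component-wise Negate and DivideReal compute (the float2 struct and the main function are stand-ins for illustration, not CLBlast code):

    #include <stdio.h>

    typedef struct { float x, y; } float2;  /* stand-in for the OpenCL float2 vector type */

    /* Same component-wise definitions as the macros added above */
    #define Negate(value) value.x = -(value.x); value.y = -(value.y)
    #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.y / b.x

    int main(void) {
      float2 a = {3.0f, -1.0f};
      float2 b = {2.0f, 0.0f};  /* DivideReal only uses the real part of b */
      float2 c;
      DivideReal(c, a, b);      /* c = (1.5, -0.5) */
      Negate(c);                /* c = (-1.5, 0.5) */
      printf("c = (%g, %g)\n", c.x, c.y);
      return 0;
    }
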
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
new file mode 100644
index 00000000..9231d725
--- /dev/null
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -0,0 +1,440 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to invert square diagonal blocks of a matrix. These kernels are based
+// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
+// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
+// and Jack Dongarra.
+//
+// =================================================================================================
+//
+// Let A be a block_size*block_size lower triangular matrix, and B its inverse.
+// Then the block decomposition
+//
+// [ A11 0 ] * [ B11 0 ] = [ I 0 ]
+// [ A21 A22 ] [ B21 B22 ] [ 0 I ]
+//
+// yields
+//
+// A11*B11 = I ==> B11 = A11^{-1},
+// A22*B22 = I ==> B22 = A22^{-1},
+// A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11.
+//
+// The InvertDiagonalBlock kernel inverts A11 and A22.
+// The TripleMatMul routines multiply:
+// part 1: B21 = A21 * B11,
+// part 2: B21 = -B22 * B21.
+//
+// At this level, inner block is current_size=16, with one 4 x 4 work-group per inner block. Each
+// submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied by 2
+// at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block,
+// here current_size*2=32,
+//                [ B11  0  ]
+// which contains [ B21 B22 ].
+// Outer blocks are block_size x block_size.
+//
+// A21 may have < current_size rows, but is guaranteed to have current_size cols since A22 is on
+// the right. This makes a single check easy to do.
+//
+// B is stored in a workspace that is a full multiple of block_size x block_size, so no bounds
+// checks are needed there.
+//
+// We split this into part 1 & part 2 to synchronize all work-groups and to make sure that the
+// writes to B21 (B12 in the upper-triangular case) are observed by all work-groups.
+//
+// =================================================================================================
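
As a quick sanity check of the three formulas above, a small standalone C sketch (illustrative only) that inverts a 2 x 2 lower-triangular matrix using 1 x 1 scalar 'blocks', i.e. B11 = A11^{-1}, B22 = A22^{-1} and B21 = -B22*A21*B11:

    #include <stdio.h>

    /* Inverts the lower-triangular matrix [a11 0; a21 a22] via the block formulas above,
       specialised to 1 x 1 (scalar) blocks. */
    static void invert_lower_2x2(double a11, double a21, double a22,
                                 double *b11, double *b21, double *b22) {
      *b11 = 1.0 / a11;               /* B11 = A11^{-1} */
      *b22 = 1.0 / a22;               /* B22 = A22^{-1} */
      *b21 = -(*b22) * a21 * (*b11);  /* B21 = -B22 * A21 * B11 */
    }

    int main(void) {
      double b11, b21, b22;
      invert_lower_2x2(4.0, 6.0, 2.0, &b11, &b21, &b22);
      /* The (2,1) entry of A*B must vanish: a21*b11 + a22*b21 == 0 */
      printf("B = [%g 0; %g %g], residual = %g\n", b11, b21, b22, 6.0*b11 + 2.0*b21);
      return 0;
    }
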
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_INVERT)
+
+#define LOCALX 17 // 16 + 1 to avoid bank conflicts
+#define LOCALY 16
+
+// =================================================================================================
+
+__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
+void FillMatrix(const int n, const int ld, const int offset,
+ __global real* restrict dest, const real_arg arg_value) {
+ const real value = GetRealArg(arg_value);
+ const int id_one = get_global_id(0);
+ const int id_two = get_global_id(1);
+ if (id_one < ld && id_two < n) {
+ dest[id_two*ld + id_one + offset] = value;
+ }
+}
+
+// =================================================================================================
+
+// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
+__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
+void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
+ __global real* restrict dest, const int outer_block_size,
+ const int unit_diagonal, const int is_upper)
+{
+ const int thread_index = get_local_id(0);
+ const int block_index = get_group_id(0);
+
+ // Sets the offset for this particular block in the source and destination matrices
+ const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
+ const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
+  // Goes to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block,
+  // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE
+  // block inside that
+  const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size +
+                                (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE);
+
+ // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
+ __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+  // Loads the source triangle (upper or lower, depending on 'is_upper') into local memory. Any
+  // values in the opposite triangle or outside of the matrix are set to zero
+ #pragma unroll
+ for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+ const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) :
+ (thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
+ if (condition) {
+ lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset];
+ }
+ else {
+ SetToZero(lm[thread_index][j]);
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Inverts the diagonal
+ real inverted_diagonal;
+ SetToOne(inverted_diagonal);
+ if (unit_diagonal == 0) {
+ const real diagonal_value = lm[thread_index][thread_index];
+ if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix
+ DivideReal(inverted_diagonal, inverted_diagonal, diagonal_value);
+ }
+ }
+ lm[thread_index][thread_index] = inverted_diagonal;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Upper-triangular
+ if (is_upper) {
+
+ // Computes the elements 0:j-1 of the j-th column
+ for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) {
+ if (thread_index < j) {
+ real sum;
+ SetToZero(sum);
+ #pragma unroll
+ for (int k = 0; k < j; ++k) {
+ MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
+ }
+ real diagonal_value = lm[j][j];
+ Negate(diagonal_value);
+ Multiply(lm[thread_index][j], diagonal_value, sum);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ }
+
+ // Lower triangular
+ else {
+
+ // Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column
+ for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) {
+ if (thread_index > j) {
+ real sum;
+ SetToZero(sum);
+ #pragma unroll
+ for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
+ MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
+ }
+        real diagonal_value = lm[j][j];
+        Negate(diagonal_value);
+        Multiply(lm[thread_index][j], diagonal_value, sum);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ }
+
+ // Writes the result to global memory
+ #pragma unroll
+ for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+ dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
+ }
+}
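
The kernel is written so that work-group g (block_index = get_group_id(0)) inverts the g-th INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE block on the diagonal. The host-side invocation is not part of this commit; a hypothetical launch with the plain OpenCL C API could look as follows (all host variable names, and the choice of one work-group per INTERNAL_BLOCK_SIZE rows, are assumptions for illustration):

    #include <CL/cl.h>

    /* Hypothetical host-side launch (not part of this diff): one work-group of
       INTERNAL_BLOCK_SIZE work-items per diagonal block, matching the kernel's
       reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1) attribute. */
    static cl_int LaunchInvertDiagonalBlock(cl_command_queue queue, cl_kernel kernel, cl_int n,
                                            cl_mem src, cl_int src_offset, cl_int src_ld,
                                            cl_mem dest, cl_int outer_block_size,
                                            cl_int unit_diagonal, cl_int is_upper) {
      const cl_int internal_block_size = 16;  /* must match INTERNAL_BLOCK_SIZE in the kernel */
      const cl_int num_blocks = (n + internal_block_size - 1) / internal_block_size;
      const size_t local_size = (size_t)internal_block_size;
      const size_t global_size = (size_t)num_blocks * local_size;  /* one group per block */
      clSetKernelArg(kernel, 0, sizeof(cl_int), &n);
      clSetKernelArg(kernel, 1, sizeof(cl_mem), &src);
      clSetKernelArg(kernel, 2, sizeof(cl_int), &src_offset);
      clSetKernelArg(kernel, 3, sizeof(cl_int), &src_ld);
      clSetKernelArg(kernel, 4, sizeof(cl_mem), &dest);
      clSetKernelArg(kernel, 5, sizeof(cl_int), &outer_block_size);
      clSetKernelArg(kernel, 6, sizeof(cl_int), &unit_diagonal);
      clSetKernelArg(kernel, 7, sizeof(cl_int), &is_upper);
      return clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size,
                                    0, NULL, NULL);
    }
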
+
+// =================================================================================================
+
+// Triple matrix-multiplication kernel: C = A * B
+inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
+ __global const real* agm, __global const real* bgm, __global real* cgm,
+ const int lda, const int ldb, const int ldc,
+ int current_size, int num_pages, const int block_size) {
+
+ // Emulates a 3D grid: NX * (NY * num_pages)
+ const int by = get_group_id(1) / num_pages;
+ const int page = get_group_id(1) % num_pages;
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1));
+ const int iby = by*16;
+ const int id = lidx + lidy*get_local_size(0);
+ const int row = page*current_size*2 + current_size + ibx + id;
+ int col = page*current_size*2 + current_size;
+
+ // Sets the offsets for this specific thread
+ agm += ibx + id;
+ bgm += lidx + (iby + lidy)*ldb;
+ cgm += ibx + id + iby*ldc;
+
+ // Initializes the result registers
+ real cpm[16];
+ #pragma unroll
+ for (int j = 0; j < 16; ++j) {
+ SetToZero(cpm[j]);
+ }
+
+ // Computes NT x 16 block of C, each thread computes one 1 x 16 row
+ for (int k = 0; k < current_size; k += 16) {
+
+ // Loads a 16 x 16 block of B into local memory using NX x 4 threads
+ #pragma unroll
+ for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0)
+ #pragma unroll
+ for( int j=0; j < 16; j += 4 ) { // += get_local_size(1)
+ blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb];
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Upper triangular
+ if (upper) {
+
+ // Performs 16 x 16 multiply-add operations
+ #pragma unroll
+ for (int i = 0; i < 16; ++i) {
+ if (part == 2 || col++ < n) {
+ #pragma unroll
+ for (int j = 0; j < 16; ++j) {
+ MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
+ }
+ }
+ }
+ }
+
+ // Lower triangular
+ else {
+ if (row < n) {
+
+ // Performs 16 x 16 multiply-add operations
+ #pragma unroll
+ for (int i = 0; i < 16; ++i) {
+ #pragma unroll
+ for (int j = 0; j < 16; ++j) {
+ MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
+ }
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+  // Stores the NT x 16 result: each thread writes one 1 x 16 row
+ #pragma unroll
+ for (int i = 0; i < 16; ++i) {
+ if (part == 2) { Negate(cpm[i]); }
+ cgm[0] = cpm[i];
+ cgm += ldc;
+ }
+}
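
The B tile is staged in local memory with a row stride of LOCALX = 17 instead of 16: during the load, work-items with consecutive lidx write addresses that are LOCALX elements apart, and the extra element keeps those addresses from all mapping to the same bank. A tiny standalone C illustration (assuming, purely for illustration, 16 one-element-wide local-memory banks):

    #include <stdio.h>

    /* Prints the bank index hit by each of 16 accesses that are 'stride' elements apart.
       With stride 16 every access lands in the same bank; with stride 17 they all differ. */
    int main(void) {
      const int num_banks = 16;
      const int col = 3;  /* any fixed column offset within the tile */
      for (int stride = 16; stride <= 17; ++stride) {
        printf("stride %2d:", stride);
        for (int row = 0; row < 16; ++row) {
          printf(" %2d", (row * stride + col) % num_banks);
        }
        printf("\n");
      }
      return 0;
    }
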
+
+// =================================================================================================
+
+// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower)
+inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
+ __global const real* src, const int a_offset, const int lda,
+ __global real* dest, int current_size, int num_pages, const int block_size) {
+
+ // Emulates a 3D grid: NX * (NY * num_pages)
+ const int page = get_group_id(1) % num_pages;
+
+ // Computes the destination block offset:
+ // - go to the (page / pages_per_block) outer block_size * block_size block
+ // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
+ const int pages_per_block = block_size / (current_size*2);
+ dest += (page / pages_per_block) * block_size * block_size +
+ (page % pages_per_block) * (current_size*2*block_size + current_size*2);
+
+ // Using the GEMM notation: C = A*B
+ __global const real* agm;
+ __global const real* bgm;
+ __global real* cgm;
+ if (upper) { // upper triangular: B12 = A12 * B22
+ agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12
+ bgm = dest + current_size*block_size + current_size; // B22
+ cgm = dest + current_size*block_size; // B12
+ }
+ else { // lower triangular: B21 = A21 * B11
+ agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21
+ bgm = dest; // B11
+ cgm = dest + current_size; // B21
+ }
+
+ // Runs the generic C = A * B matrix multiplication
+ const int ldb = block_size;
+ const int ldc = block_size;
+ TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
+}
+
+// Triple matrix-multiplication kernel part 2: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
+inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
+ __global real* dest, int current_size, int num_pages, const int block_size) {
+
+ // Emulates a 3D grid: NX * (NY * num_pages)
+ const int page = get_group_id(1) % num_pages;
+
+ // Computes the destination block offset:
+ // - go to the (page / pages_per_block) outer block_size * block_size block
+ // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
+ const int pages_per_block = block_size / (current_size*2);
+ dest += (page / pages_per_block) * block_size * block_size +
+ (page % pages_per_block) * (current_size*2*block_size + current_size*2);
+
+ // Using the GEMM notation: C = A*B
+ __global const real* agm;
+ __global const real* bgm;
+ __global real* cgm;
+ if (upper) { // upper triangular: B12 = -B11 * B12
+ agm = dest; // B11
+ cgm = dest + current_size*block_size; // B12
+ bgm = cgm; // B12, okay to overwrite
+ }
+
+ else { // lower triangular: B21 = -B22 * B21
+ agm = dest + current_size*block_size + current_size; // B22
+ cgm = dest + current_size; // B21
+ bgm = cgm; // B21, okay to overwrite
+ }
+
+ // Runs the generic C = A * B matrix multiplication
+ const int lda = block_size;
+ const int ldb = block_size;
+ const int ldc = block_size;
+ TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
+}
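
How the host drives these routines is not part of this commit. Per the header comment, the inversion presumably starts from the INTERNAL_BLOCK_SIZE diagonal blocks produced by InvertDiagonalBlock and doubles current_size at every level, launching one part-1 and one part-2 kernel per level until the outer block_size is reached. A hypothetical sketch of that doubling loop (the launch_* stubs and the num_pages formula are assumptions, chosen to match the 'workspace is a multiple of block_size' note above):

    #include <stdio.h>

    /* Stubs standing in for the real kernel launches (illustration only). */
    static void launch_invert_diagonal_block(int n) {
      printf("InvertDiagonalBlock over %d diagonal blocks of 16 x 16\n", (n + 15) / 16);
    }
    static void launch_triple_matmul(int part, int current_size, int num_pages) {
      printf("TripleMatMul part %d: current_size=%d, num_pages=%d\n", part, current_size, num_pages);
    }

    /* Doubles the inverted block size per level: 16 -> 32 -> 64 -> ... -> block_size. */
    static void invert_diagonal_blocks(int n, int block_size) {
      const int padded_n = ((n + block_size - 1) / block_size) * block_size;  /* workspace size */
      launch_invert_diagonal_block(n);
      for (int current_size = 16; current_size < block_size; current_size *= 2) {
        const int num_pages = padded_n / (2 * current_size);  /* one page per 2*current_size rows */
        launch_triple_matmul(1, current_size, num_pages);  /* e.g. B21 =  A21 * B11  (lower) */
        launch_triple_matmul(2, current_size, num_pages);  /* e.g. B21 = -B22 * B21  (lower) */
      }
    }

    int main(void) {
      invert_diagonal_blocks(100, 64);  /* example: n = 100, outer block_size = 64 */
      return 0;
    }
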
+
+// =================================================================================================
+
+// B21 = A21 * B11
+__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B21 = -B22 * B21
+__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B21 = A21 * B11
+__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B21 = -B22 * B21
+__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B21 = A21 * B11
+__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B21 = -B22 * B21
+__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// =================================================================================================
+
+// B12 = A12 * B22
+__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B12 = -B11 * B12
+__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B12 = A12 * B22
+__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B12 = -B11 * B12
+__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B12 = A12 * B22
+__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B12 = -B11 * B12
+__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================