diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-01-15 17:30:00 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-01-15 17:30:00 +0100 |
commit | 4b3ffd998904f5c848edc5917308f5942fa71da3 (patch) | |
tree | f264f9e0fe241fdbe25b623c4208da222a99d973 | |
parent | 4a4be0c3a5e603a9c0c4c7aebcb660b5e62b99ad (diff) |
Added a first version of the diagonal block invert routine in preparation of TRSM
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | src/database/database.cpp | 2 | ||||
-rw-r--r-- | src/database/kernels/invert.hpp | 78 | ||||
-rw-r--r-- | src/kernels/common.opencl | 14 | ||||
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks.opencl | 440 | ||||
-rw-r--r-- | src/routines/common.hpp | 21 | ||||
-rw-r--r-- | src/routines/levelx/xinvert.cpp | 152 | ||||
-rw-r--r-- | src/routines/levelx/xinvert.hpp | 40 | ||||
-rw-r--r-- | src/utilities/utilities.cpp | 24 | ||||
-rw-r--r-- | src/utilities/utilities.hpp | 4 | ||||
-rw-r--r-- | test/correctness/routines/levelx/xinvert.cpp | 30 | ||||
-rw-r--r-- | test/performance/routines/levelx/xinvert.cpp | 37 | ||||
-rw-r--r-- | test/routines/levelx/xinvert.hpp | 219 |
13 files changed, 1062 insertions, 1 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index f6fa4045..a9cabac7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,7 +159,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm) -set(LEVELX_ROUTINES xomatcopy) +set(LEVELX_ROUTINES xomatcopy xinvert) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES}) set(PRECISIONS 32 64 3232 6464 16) diff --git a/src/database/database.cpp b/src/database/database.cpp index cf548d46..bcbd9955 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -26,6 +26,7 @@ #include "database/kernels/pad.hpp" #include "database/kernels/transpose.hpp" #include "database/kernels/padtranspose.hpp" +#include "database/kernels/invert.hpp" #include "database/kernel_selection.hpp" namespace clblast { @@ -45,6 +46,7 @@ const std::vector<const Database::DatabaseEntry*> Database::database = { &database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble, &database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble, &database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble, + &database::InvertHalf, &database::InvertSingle, &database::InvertDouble, &database::InvertComplexSingle, &database::InvertComplexDouble, &database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble }; diff --git a/src/database/kernels/invert.hpp b/src/database/kernels/invert.hpp new file mode 
100644 index 00000000..2717f182 --- /dev/null +++ b/src/database/kernels/invert.hpp @@ -0,0 +1,78 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// Tuning parameters for the diagonal matrix inversion kernels +// +// ================================================================================================= + +namespace clblast { +namespace database { +// ================================================================================================= + +const Database::DatabaseEntry InvertHalf = { + "Invert", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertSingle = { + "Invert", Precision::kSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertComplexSingle = { + "Invert", Precision::kComplexSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertDouble = { + "Invert", Precision::kDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// 
================================================================================================= + +const Database::DatabaseEntry InvertComplexDouble = { + "Invert", Precision::kComplexDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace database +} // namespace clblast diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index b0817242..bfa1042d 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -162,6 +162,13 @@ R"( #define AbsoluteValue(value) value = fabs(value) #endif +// Negation (component-wise) +#if PRECISION == 3232 || PRECISION == 6464 + #define Negate(value) value.x = -(value.x); value.y = -(value.y) +#else + #define Negate(value) value = -(value) +#endif + // Adds two complex variables #if PRECISION == 3232 || PRECISION == 6464 #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y @@ -193,6 +200,13 @@ #endif #endif +// The scalar division function (divides by the real part of the divisor, component-wise) +#if PRECISION == 3232 || PRECISION == 6464 + #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.y / b.x +#else + #define DivideReal(c, a, b) c = a / b +#endif + // The scalar AXPBY function #if PRECISION == 3232 || PRECISION == 6464 #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d) diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl new file mode 100644 index 00000000..9231d725 --- /dev/null +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -0,0 +1,440 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains kernels to invert squared diagonal blocks of a matrix. These kernels are based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= +// +// Let A be an block_size*block_size lower triangular matrix, and B its inverse. +// Then the block decomposition +// +// [ A11 0 ] * [ B11 0 ] = [ I 0 ] +// [ A21 A22 ] [ B21 B22 ] [ 0 I ] +// +// yields +// +// A11*B11 = I ==> B11 = A11^{-1}, +// A22*B22 = I ==> B22 = A22^{-1}, +// A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11. +// +// The InvertDiagonalBlock kernel inverts A11 and A22. +// The TripleMatMul routines multiply: +// part 1: B21 = A21 * B11, +// part 2: B21 = -B22 * B21. +// +// At this level, inner block is current_size=16, with one 4 x 4 work-group per inner block. Each +// submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied by 2 +// at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block, +// here current_size*2=32, +// [ B11 0 ] +// which contains [ B21 B22 ]. +// Outer blocks are block_size x block_size. +// +// A21 may have < current_size rows, but is guaranteed to have current_size cols since A22 is on +// the right. This makes a single check easy to do. +// +// B is stored in workspace that is a full multiple of block_size x block_size; no checks needed. +// +// We split this into part1 & part2 to synchronize all blocks and make sure +// that writes to B12 are observed by all blocks. 
+// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_INVERT) + +#define LOCALX 17 // 16 + 1 to avoid bank conflicts +#define LOCALY 16 + +// ================================================================================================= + +__kernel __attribute__((reqd_work_group_size(8, 8, 1))) +void FillMatrix(const int n, const int ld, const int offset, + __global real* restrict dest, const real_arg arg_value) { + const real value = GetRealArg(arg_value); + const int id_one = get_global_id(0); + const int id_two = get_global_id(1); + if (id_one < ld && id_two < n) { + dest[id_two*ld + id_one + offset] = value; + } +} + +// ================================================================================================= + +// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix +__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) +void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld, + __global real* restrict dest, const int outer_block_size, + const int unit_diagonal, const int is_upper) +{ + const int thread_index = get_local_id(0); + const int block_index = get_group_id(0); + + // Sets the offset for this particular block in the source and destination matrices + const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset; + const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE; + const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / 
num_inner_blocks) outer outer_block_size*outer_block_size block, + (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that + + // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE + __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // Loads the source lower triangle into local memory. Any values in the upper triangle or + // outside of the matrix are set to zero + #pragma unroll + for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) : + (thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n); + if (condition) { + lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset]; + } + else { + SetToZero(lm[thread_index][j]); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Inverts the diagonal + real inverted_diagonal; + SetToOne(inverted_diagonal); + if (unit_diagonal == 0) { + const real diagonal_value = lm[thread_index][thread_index]; + if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix + DivideReal(inverted_diagonal, inverted_diagonal, diagonal_value); + } + } + lm[thread_index][thread_index] = inverted_diagonal; + barrier(CLK_LOCAL_MEM_FENCE); + + // Upper-triangular + if (is_upper) { + + // Computes the elements 0:j-1 of the j-th column + for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) { + if (thread_index < j) { + real sum; + SetToZero(sum); + #pragma unroll + for (int k = 0; k < j; ++k) { + MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); + } + real diagonal_value = lm[j][j]; + Negate(diagonal_value); + Multiply(lm[thread_index][j], diagonal_value, sum); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // Lower triangular + else { + + // Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column + 
for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) { + if (thread_index > j) { + real sum; + SetToZero(sum); + #pragma unroll + for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) { + MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); + } + Multiply(lm[thread_index][j], -lm[j][j], sum); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // Writes the result to global memory + #pragma unroll + for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j]; + } +} + +// ================================================================================================= + +// Triple matrix-multiplication kernel: C = A * B +inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n, + __global const real* agm, __global const real* bgm, __global real* cgm, + const int lda, const int ldb, const int ldc, + int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int by = get_group_id(1) / num_pages; + const int page = get_group_id(1) % num_pages; + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); + const int iby = by*16; + const int id = lidx + lidy*get_local_size(0); + const int row = page*current_size*2 + current_size + ibx + id; + int col = page*current_size*2 + current_size; + + // Sets the offsets for this specific thread + agm += ibx + id; + bgm += lidx + (iby + lidy)*ldb; + cgm += ibx + id + iby*ldc; + + // Initializes the result registers + real cpm[16]; + #pragma unroll + for (int j = 0; j < 16; ++j) { + SetToZero(cpm[j]); + } + + // Computes NT x 16 block of C, each thread computes one 1 x 16 row + for (int k = 0; k < current_size; k += 16) { + + // Loads a 16 x 16 block of B into local memory using NX x 4 threads + #pragma unroll + for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0) + #pragma unroll + 
for( int j=0; j < 16; j += 4 ) { // += get_local_size(1) + blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Upper triangular + if (upper) { + + // Performs 16 x 16 multiply-add operations + #pragma unroll + for (int i = 0; i < 16; ++i) { + if (part == 2 || col++ < n) { + #pragma unroll + for (int j = 0; j < 16; ++j) { + MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]); + } + } + } + } + + // Lower triangular + else { + if (row < n) { + + // Performs 16 x 16 multiply-add operations + #pragma unroll + for (int i = 0; i < 16; ++i) { + #pragma unroll + for (int j = 0; j < 16; ++j) { + MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]); + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores NT x 16 results: each thread writes one 16 x 1 row + #pragma unroll + for (int i = 0; i < 16; ++i) { + if (part == 2) { Negate(cpm[i]); } + cgm[0] = cpm[i]; + cgm += ldc; + } +} + +// ================================================================================================= + +// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower) +inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n, + __global const real* src, const int a_offset, const int lda, + __global real* dest, int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int page = get_group_id(1) % num_pages; + + // Computes the destination block offset: + // - go to the (page / pages_per_block) outer block_size * block_size block + // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that + const int pages_per_block = block_size / (current_size*2); + dest += (page / pages_per_block) * block_size * block_size + + (page % pages_per_block) * (current_size*2*block_size + current_size*2); + + // Using the GEMM notation: C = A*B + __global const real* agm; + 
__global const real* bgm; + __global real* cgm; + if (upper) { // upper triangular: B12 = A12 * B22 + agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12 + bgm = dest + current_size*block_size + current_size; // B22 + cgm = dest + current_size*block_size; // B12 + } + else { // lower triangular: B21 = A21 * B11 + agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21 + bgm = dest; // B11 + cgm = dest + current_size; // B21 + } + + // Runs the generic C = A * B matrix multiplication + const int ldb = block_size; + const int ldc = block_size; + TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); +} + +// Triple matrix-multiplication kernel part 2: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) +inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n, + __global real* dest, int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int page = get_group_id(1) % num_pages; + + // Computes the destination block offset: + // - go to the (page / pages_per_block) outer block_size * block_size block + // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that + const int pages_per_block = block_size / (current_size*2); + dest += (page / pages_per_block) * block_size * block_size + + (page % pages_per_block) * (current_size*2*block_size + current_size*2); + + // Using the GEMM notation: C = A*B + __global const real* agm; + __global const real* bgm; + __global real* cgm; + if (upper) { // upper triangular: B12 = -B11 * B12 + agm = dest; // B11 + cgm = dest + current_size*block_size; // B12 + bgm = cgm; // B12, okay to overwrite + } + + else { // lower triangular: B21 = -B22 * B21 + agm = dest + current_size*block_size + current_size; // B22 + cgm = dest + current_size; // B21 + bgm = cgm; // B21, okay to 
overwrite + } + + // Runs the generic C = A * B matrix multiplication + const int lda = block_size; + const int ldb = block_size; + const int ldc = block_size; + TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); +} + +// ================================================================================================= + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size); +} + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size); +} + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part1Lower(int n, __global const 
real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); +} + +// ================================================================================================= + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size); +} + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part2Upper(int n, 
__global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size); +} + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 53ca6355..47f98de0 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -33,6 +33,27 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device, // ================================================================================================= +// Sets all elements of a matrix to a constant value +template <typename T> +void FillMatrix(Queue &queue, const Device &device, + const Program &program, const Database &, + EventPointer event, const std::vector<Event> &waitForEvents, + const size_t n, const size_t ld, const size_t offset, + const Buffer<T> &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillMatrix"); + kernel.SetArgument(0, static_cast<int>(n)); + 
kernel.SetArgument(1, static_cast<int>(ld)); + kernel.SetArgument(2, static_cast<int>(offset)); + kernel.SetArgument(3, dest()); + kernel.SetArgument(4, GetRealArg(constant_value)); + auto local = std::vector<size_t>{8, 8}; + auto global = std::vector<size_t>{Ceil(ld, 8), Ceil(n, 8)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + +// ================================================================================================= + // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able // to write to symmetric and triangular matrices through optional arguments. template <typename T> diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp new file mode 100644 index 00000000..5ffba958 --- /dev/null +++ b/src/routines/levelx/xinvert.cpp @@ -0,0 +1,152 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the common code to perform (partial) matrix inverting. This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. 
+// +// ================================================================================================= + +#include "routines/levelx/xinvert.hpp" + +#include <string> +#include <vector> +#include <assert.h> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, { + #include "../../kernels/level3/invert_diagonal_blocks.opencl" + }) { +} + +// ================================================================================================= + +// Inverts diagonal square blocks of a matrix +template <typename T> +void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer<T> &src, const size_t offset, const size_t ld_src, + Buffer<T> &dest) { + + // Makes sure all dimensions are larger than zero and the block size is smaller than n + if ((block_size == 0) || (n == 0) || (block_size > n)) { + throw BLASError(StatusCode::kInvalidDimension); + } + + // Helper variables + const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]); + assert(internal_block_size == 16); + const auto num_blocks = CeilDiv(n, block_size); + const auto num_internal_blocks = CeilDiv(n, internal_block_size); + const auto unit_diagonal = (diag == Diagonal::kUnit) ? 
true : false; + + // This routine only supports block sizes which are a multiple of the internal block size and + // block sizes up to and including 128 + if ((block_size % internal_block_size != 0) || (block_size > 128)) { + throw BLASError(StatusCode::kInvalidDimension); + } + + // Checks for validity of the source and destination matrices + TestMatrixA(n, n, src, offset, ld_src); + TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size); + + // Determines which kernels to run based on the layout (the kernels assume column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + const auto name_postfix = (is_upper) ? "Upper" : "Lower"; + + // Retrieves the program from the cache + auto event_wait_list = std::vector<Event>(); + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "INVERT"); + + // Fills the output buffer with zeros + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), event_wait_list, + num_blocks * block_size, block_size, 0, dest, ConstantZero<T>()); + event_wait_list.push_back(fill_matrix_event); + + // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group + auto kernel = Kernel(program, "InvertDiagonalBlock"); + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, src()); + kernel.SetArgument(2, static_cast<int>(offset)); + kernel.SetArgument(3, static_cast<int>(ld_src)); + kernel.SetArgument(4, dest()); + kernel.SetArgument(5, static_cast<int>(block_size)); + kernel.SetArgument(6, static_cast<int>(unit_diagonal)); + kernel.SetArgument(7, static_cast<int>(is_upper)); + const auto local = std::vector<size_t>{internal_block_size}; + const auto global = std::vector<size_t>{num_internal_blocks * 
internal_block_size}; + auto base_kernel_event = Event(); + RunKernel(kernel, queue_, device_, global, local, base_kernel_event.pointer(), event_wait_list); + event_wait_list.push_back(base_kernel_event); + + // Builds up block_size x block_size blocks. For example, internal_block_size=16: + // use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads; + // then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads; + // then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads; + for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) { + assert(current_size == 16 || current_size == 32 || current_size == 64); + + // Emulates a 3D grid: NX * (NY * npages) + const auto npages = CeilDiv(n, current_size*2); + const auto local0 = (current_size <= 32) ? current_size/4 : 16; + const auto local = std::vector<size_t>{local0, 4}; + const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]}; + + // Part 1 + auto kernel1 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, src()); + kernel1.SetArgument(2, static_cast<int>(offset)); + kernel1.SetArgument(3, static_cast<int>(ld_src)); + kernel1.SetArgument(4, dest()); + kernel1.SetArgument(5, static_cast<int>(current_size)); + kernel1.SetArgument(6, static_cast<int>(npages)); + kernel1.SetArgument(7, static_cast<int>(block_size)); + auto kernel1_event = Event(); + RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list); + event_wait_list.push_back(kernel1_event); + + // Part 2 + const bool is_last_kernel = (current_size * 2 >= block_size); + auto kernel2 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); + kernel2.SetArgument(0, static_cast<int>(n)); + kernel2.SetArgument(1, dest()); + kernel2.SetArgument(2, 
static_cast<int>(current_size)); + kernel2.SetArgument(3, static_cast<int>(npages)); + kernel2.SetArgument(4, static_cast<int>(block_size)); + auto kernel2_event = Event(); + auto eventPointer = (is_last_kernel) ? event_ : kernel2_event.pointer(); + RunKernel(kernel2, queue_, device_, global, local, eventPointer, event_wait_list); + if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); } + + // Exit in case we reach beyond the bounds of the input matrix + if (current_size*2 >= n) { break; } + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xinvert<half>; +template class Xinvert<float>; +template class Xinvert<double>; +template class Xinvert<float2>; +template class Xinvert<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xinvert.hpp b/src/routines/levelx/xinvert.hpp new file mode 100644 index 00000000..fa0a80e7 --- /dev/null +++ b/src/routines/levelx/xinvert.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the common code to perform (partial) matrix inverting. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XINVERT_H_ +#define CLBLAST_ROUTINES_XINVERT_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +template <typename T> +class Xinvert: public Routine { + public: + + // Constructor + Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT"); + + // Inverts diagonal square blocks of a matrix + void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer<T> &src, const size_t offset, const size_t ld_src, + Buffer<T> &dest); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XINVERT_H_ +#endif diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index 5e445bb9..b8e011c5 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -46,6 +46,30 @@ double2 GetScalar() { return {2.0, 0.5}; } +// Returns a scalar of value 0 +template <typename T> +T ConstantZero() { + return static_cast<T>(0.0); +} +template float ConstantZero<float>(); +template double ConstantZero<double>(); + +// Specialized version of the above for half-precision +template <> +half ConstantZero() { + return FloatToHalf(0.0f); +} + +// Specialized versions of the above for complex data-types +template <> +float2 ConstantZero() { + return {0.0f, 0.0f}; +} +template <> +double2 ConstantZero() { + return {0.0, 0.0}; +} + // Returns a scalar of value 1 template <typename T> T ConstantOne() { diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index 20587bd4..26dee23b 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -102,6 +102,10 @@ constexpr auto kArgNoAbbreviations = 
"no_abbrv"; template <typename T> T GetScalar(); +// Returns a scalar of value 0 +template <typename T> +T ConstantZero(); + // Returns a scalar of value 1 template <typename T> T ConstantOne(); diff --git a/test/correctness/routines/levelx/xinvert.cpp b/test/correctness/routines/levelx/xinvert.cpp new file mode 100644 index 00000000..0ccc0829 --- /dev/null +++ b/test/correctness/routines/levelx/xinvert.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/levelx/xinvert.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests<clblast::TestXinvert<float>, float, float>(argc, argv, false, "SINVERT"); + errors += clblast::RunTests<clblast::TestXinvert<double>, double, double>(argc, argv, true, "DINVERT"); + errors += clblast::RunTests<clblast::TestXinvert<float2>, float2, float2>(argc, argv, true, "CINVERT"); + errors += clblast::RunTests<clblast::TestXinvert<double2>, double2, double2>(argc, argv, true, "ZINVERT"); + errors += clblast::RunTests<clblast::TestXinvert<half>, half, half>(argc, argv, true, "HINVERT"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/performance/routines/levelx/xinvert.cpp 
b/test/performance/routines/levelx/xinvert.cpp new file mode 100644 index 00000000..87f36b1e --- /dev/null +++ b/test/performance/routines/levelx/xinvert.cpp @@ -0,0 +1,37 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/levelx/xinvert.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); + switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXinvert<half>, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient<clblast::TestXinvert<float>, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient<clblast::TestXinvert<double>, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient<clblast::TestXinvert<float2>, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient<clblast::TestXinvert<double2>, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp new file mode 100644 index 
00000000..4408e8d5 --- /dev/null +++ b/test/routines/levelx/xinvert.hpp @@ -0,0 +1,219 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the Xinvert routine. Examples of +// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XINVERT_H_ +#define CLBLAST_TEST_ROUTINES_XINVERT_H_ + +#include <vector> +#include <string> + +#include "routines/levelx/xinvert.hpp" + +namespace clblast { +// ================================================================================================= + +template <typename T> +StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) || + (args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor)); + + // Data transfer from OpenCL to std::vector + std::vector<T> a_mat_cpu(args.a_size, T{0.0}); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + + // Creates the output buffer + std::vector<T> b_mat_cpu(args.b_size, T{0.0}); + + // Helper variables + const auto block_size = args.m; + const auto num_blocks = CeilDiv(args.n, block_size); + const auto a_ld = args.a_ld; + const auto b_ld = block_size; + if ((block_size == 0) || (args.n == 0) || (block_size > args.n)) { + return StatusCode::kInvalidDimension; + } + + // Loops 
over the amount of diagonal blocks of size args.m by args.m each + for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) { + const auto a_offset = block_id * (block_size + a_ld * block_size) + args.a_offset; + const auto b_offset = block_id * block_size * block_size; + + // Inverts the diagonal elements of the matrix + for (auto i = size_t{0}; i < block_size; ++i) { + auto a_value = T{1.0}; + if (args.diagonal == Diagonal::kNonUnit) { + if (i + block_id * block_size < args.n) { + if (a_mat_cpu[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; } + a_value = T{1.0} / a_mat_cpu[i * a_ld + i + a_offset]; + } + } + b_mat_cpu[i * b_ld + i + b_offset] = a_value; + } + + // Inverts the upper triangle row by row + if (is_upper) { + for (int i = static_cast<int>(block_size) - 2; i >= 0; --i) { + for (auto j = static_cast<int>(block_size) - 1; j > i; --j) { + auto sum = T{0.0}; + for (auto k = i + 1; k <= j; ++k) { + auto a_value = T{0.0}; + if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) { + a_value = a_mat_cpu[k * a_ld + i + a_offset]; + } + sum += a_value * b_mat_cpu[j * b_ld + k + b_offset]; + } + b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset]; + } + } + } + + // Inverts the lower triangle row by row + else { + for (auto i = size_t{1}; i < block_size; ++i) { + for (auto j = size_t{0}; j < i; ++j) { + auto sum = T{0.0}; + for (auto k = j; k < i; ++k) { + auto a_value = T{0.0}; + if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) { + a_value = a_mat_cpu[k * a_ld + i + a_offset]; + } + sum += a_value * b_mat_cpu[j * b_ld + k + b_offset]; + } + b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset]; + } + } + } + } + + // Data transfer back to OpenCL + buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); + return StatusCode::kSuccess; +} + +// Half-precision version calling the above reference implementation after 
conversions +template <> +StatusCode RunReference<half>(const Arguments<half> &args, Buffers<half> &buffers, Queue &queue) { + auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue()); + auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue()); + auto dummy = clblast::Buffer<float>(0); + auto buffers2 = Buffers<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; + auto args2 = Arguments<float>(); + args2.a_size = args.a_size; args2.b_size = args.b_size; + args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n; + args2.a_offset = args.a_offset; + args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal; + auto status = RunReference(args2, buffers2, queue); + FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue()); + return status; +} + +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXinvert { + public: + + // The BLAS level: 4 for the extra routines + static size_t BLASLevel() { return 4; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgN, kArgM, + kArgLayout, kArgTriangle, kArgDiagonal, + kArgALeadDim, kArgAOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeA(const Arguments<T> &args) { + return args.n * args.a_ld + args.a_offset; + } + static size_t GetSizeB(const Arguments<T> &args) { + const auto block_size = args.m; + const auto num_blocks = CeilDiv(args.n, block_size); + return num_blocks * block_size * block_size; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.a_size = GetSizeA(args); + args.b_size = GetSizeB(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &args) { return args.n; } + static 
size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + try { + auto event = cl_event{}; + auto inverter = Xinvert<T>(queue, &event); + inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal, + args.n, args.m, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat); + clWaitForEvents(1, &event); + clReleaseEvent(event); + } catch (...) { return DispatchException(); } + return StatusCode::kSuccess; + } + + // Describes how to run a naive version of the routine (for correctness/performance comparison). + // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. 
+ static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + return RunReference(args, buffers, queue); + } + + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + return RunReference(args, buffers, queue); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.b_size, static_cast<T>(0)); + buffers.b_mat.Read(queue, args.b_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { return args.m; } + static size_t ResultID2(const Arguments<T> &args) { return Ceil(args.n, args.m); } + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) { + return id1 * Ceil(args.n, args.m) + id2; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + const auto block_size = args.m; + const auto num_blocks = CeilDiv(args.n, block_size); + return num_blocks * (block_size * (block_size / 2) * (block_size / 2)); + } + static size_t GetBytes(const Arguments<T> &args) { + return (args.a_size * args.b_size) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XINVERT_H_ +#endif |