diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-12-23 14:18:07 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-12-23 14:18:07 +0100 |
commit | 736399e528e9624850e383da468bb3931a5944c3 (patch) | |
tree | f506a95d0642397cdbb2939c2c4ae7802f8baf37 /src/kernels/level3 | |
parent | b1f52f130c4f9e4346579003b2786aa2e082f234 (diff) |
Split the invert kernel into two parts to prevent error C1091 (string literal exceeds the compiler's length limit) in MSVC 2013
Diffstat (limited to 'src/kernels/level3')
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks_part1.opencl (renamed from src/kernels/level3/invert_diagonal_blocks.opencl) | 108 | ||||
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks_part2.opencl | 131 |
2 files changed, 133 insertions, 106 deletions
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index e8f0ea91..040fcc83 100644 --- a/src/kernels/level3/invert_diagonal_blocks.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -12,6 +12,8 @@ // Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, // and Jack Dongarra. // +// This is part 1 of 2, see part 2 for the remainder of the kernel code. +// // ================================================================================================= // // Let A be an block_size*block_size lower triangular matrix, and B its inverse. @@ -336,112 +338,6 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); } -// ================================================================================================= - -// B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, - __global real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); -} - -// B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size); -} - -// B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, - __global real* restrict dest, int 
current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); -} - -// B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size); -} - -// B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, - __global real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); -} - -// B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); -} - -// ================================================================================================= - -// B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, - __global real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); -} - -// B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul16Part2Upper(int n, __global 
real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size); -} - -// B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, - __global real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); -} - -// B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size); -} - -// B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, - __global real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); -} - -// B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) -void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) -{ - __local real lm[LOCALY * LOCALX]; - TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); -} - #endif // ================================================================================================= diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl new file mode 
100644 index 00000000..8736203c --- /dev/null +++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl @@ -0,0 +1,131 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This is part 2 of 2, see part 1 of the invert kernel for a description +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_INVERT) + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size); +} + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, 
const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size); +} + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); +} + +// ================================================================================================= + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) 
+void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size); +} + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size); +} + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) +void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// 
================================================================================================= |