-rw-r--r--  src/kernels/level3/invert_diagonal_blocks_part1.opencl (renamed from src/kernels/level3/invert_diagonal_blocks.opencl) | 108
-rw-r--r--  src/kernels/level3/invert_diagonal_blocks_part2.opencl                                                                 | 131
-rw-r--r--  src/routines/levelx/xinvert.cpp                                                                                        |   4
-rw-r--r--  src/tuning/kernels/invert.cpp                                                                                          |   3
4 files changed, 138 insertions(+), 108 deletions(-)
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index e8f0ea91..040fcc83 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -12,6 +12,8 @@
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra.
//
+// This is part 1 of 2; see part 2 for the remainder of the kernel code.
+//
// =================================================================================================
//
// Let A be an block_size*block_size lower triangular matrix, and B its inverse.
@@ -336,112 +338,6 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r
TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
}
-// =================================================================================================
-
-// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
- __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
-}
-
-// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size);
-}
-
-// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
- __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
-}
-
-// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size);
-}
-
-// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
- __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
-}
-
-// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
-}
-
-// =================================================================================================
-
-// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
- __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
-}
-
-// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size);
-}
-
-// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
- __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
-}
-
-// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size);
-}
-
-// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
- __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
-}
-
-// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
-void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
-{
- __local real lm[LOCALY * LOCALX];
- TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
-}
-
#endif
// =================================================================================================
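
The Part1/Part2 split of the kernels removed above (and re-added in part 2 of the file below) comes straight from the block form of the triangular inverse; a short derivation, using only the blocks named in the kernel comments:

For the lower-triangular case, write

\[
A = \begin{pmatrix} A_{11} & 0 \\ A_{21} & A_{22} \end{pmatrix},
\qquad
B = A^{-1} = \begin{pmatrix} B_{11} & 0 \\ B_{21} & B_{22} \end{pmatrix},
\]

with $B_{11} = A_{11}^{-1}$ and $B_{22} = A_{22}^{-1}$. The off-diagonal block must satisfy $A_{21} B_{11} + A_{22} B_{21} = 0$, so

\[
B_{21} = -A_{22}^{-1} A_{21} A_{11}^{-1} = -B_{22} \left( A_{21} B_{11} \right),
\]

which the kernels evaluate in two passes: Part1 stores $B_{21} \leftarrow A_{21} B_{11}$ and Part2 overwrites it with $B_{21} \leftarrow -B_{22} B_{21}$. The upper-triangular case is the mirror image, $B_{12} = -B_{11} \left( A_{12} B_{22} \right)$, matching the Part1Upper/Part2Upper kernels.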
diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
new file mode 100644
index 00000000..8736203c
--- /dev/null
+++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
@@ -0,0 +1,131 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 2 of 2; see part 1 of the invert kernel for a description.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_INVERT)
+
+// B21 = A21 * B11
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B21 = -B22 * B21
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B21 = A21 * B11
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B21 = -B22 * B21
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B21 = A21 * B11
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B21 = -B22 * B21
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// =================================================================================================
+
+// B12 = A12 * B22
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B12 = -B11 * B12
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B12 = A12 * B22
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B12 = -B11 * B12
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B12 = A12 * B22
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
+ __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B12 = -B11 * B12
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+ __local real lm[LOCALY * LOCALX];
+ TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
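
All of these kernels carry a reqd_work_group_size attribute whose first dimension scales with the block size (1x, 2x, 4x TMMWGSX for the 16/32/64 variants), so the host has to enqueue them with exactly that local size. A hedged C++ sketch of such a launch, using the plain OpenCL C API rather than CLBlast's internal wrappers; the TMMWGSX/TMMWGSY values and the global grid are placeholders, not the library's actual launch configuration:

#include <CL/cl.h>

// Sketch only: enqueue one of the TripleMatMul*Part1 kernels with a local
// size that matches its reqd_work_group_size attribute.
cl_int LaunchTripleMatMulPart1(cl_command_queue queue, cl_kernel kernel,
                               size_t size_multiplier, size_t num_pages) {
  const size_t TMMWGSX = 4, TMMWGSY = 4;  // placeholder tuning values
  const size_t local[3] = {size_multiplier * TMMWGSX, TMMWGSY, 1};
  // Illustrative grid: one work-group per page; the real kernels use their
  // own indexing scheme.
  const size_t global[3] = {num_pages * local[0], local[1], local[2]};
  // With reqd_work_group_size set, a mismatching local size makes this call
  // fail with CL_INVALID_WORK_GROUP_SIZE.
  return clEnqueueNDRangeKernel(queue, kernel, 3, nullptr, global, local,
                                0, nullptr, nullptr);
}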
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index 5fbc5fe5..a5ef9e10 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -29,7 +29,9 @@ Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
, // separated in multiple parts to prevent C1091 in MSVC 2013
- #include "../../kernels/level3/invert_diagonal_blocks.opencl"
+ #include "../../kernels/level3/invert_diagonal_blocks_part1.opencl"
+ , // separated in multiple parts to prevent C1091 in MSVC 2013
+ #include "../../kernels/level3/invert_diagonal_blocks_part2.opencl"
}) {
}
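
The mechanism behind this hunk: each .opencl file opens with R"( and closes with )", so the pre-processor's #include expands the whole file into a single C++11 raw string literal, and the commas keep the fragments as separate elements of the brace-init list. Splitting the kernel over two files keeps every individual literal below MSVC 2013's limit on the length of a single string (error C1091) while the routine still receives the complete source. A minimal sketch of the same pattern with hypothetical file names, not CLBlast's actual Routine constructor:

#include <string>
#include <vector>

// Each included file consists of nothing but one raw string literal, e.g.
//   R"( __kernel void dummy_kernel() { } )"
// so every #include below expands to a plain string literal.
static const std::vector<std::string> kKernelFragments = {
  #include "kernel_part1.opencl"  // hypothetical file, expands to R"(...)"
  ,                               // the comma keeps it a separate element
  #include "kernel_part2.opencl"  // hypothetical file, expands to R"(...)"
};

// The fragments are concatenated at run time, just before the OpenCL program
// is compiled, so no single compile-time literal ever holds the full source.
std::string FullKernelSource() {
  std::string result;
  for (const auto &fragment : kKernelFragments) { result += fragment; }
  return result;
}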
diff --git a/src/tuning/kernels/invert.cpp b/src/tuning/kernels/invert.cpp
index cce2fc8d..c292144a 100644
--- a/src/tuning/kernels/invert.cpp
+++ b/src/tuning/kernels/invert.cpp
@@ -40,7 +40,8 @@ TunerSettings GetTunerSettings(const int, const Arguments<T> &args) {
settings.kernel_name = "TripleMatMul16Part1Lower";
settings.sources =
"#define ROUTINE_INVERT"
-#include "../src/kernels/level3/invert_diagonal_blocks.opencl"
+#include "../src/kernels/level3/invert_diagonal_blocks_part1.opencl"
+#include "../src/kernels/level3/invert_diagonal_blocks_part2.opencl"
;
// Buffer sizes
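
In the tuner there are no commas between the fragments, so "#define ROUTINE_INVERT" and the two included raw string literals are merged by ordinary adjacent-string-literal concatenation into one source string; prepending the define is what enables the #if defined(ROUTINE_INVERT) guard around the kernels in part 2. A small stand-alone illustration of that concatenation (names and kernel body are made up):

#include <iostream>
#include <string>

int main() {
  // Adjacent string literals, raw or not, are concatenated by the compiler
  // into one literal, so the define ends up in front of the kernel source.
  const std::string source =
      "#define ROUTINE_INVERT\n"
      R"(
      #if defined(ROUTINE_INVERT)
      __kernel void dummy_kernel() { }
      #endif
      )";
  std::cout << source << std::endl;
  return 0;
}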