summary | refs | log | tree | commit | diff
path: root/src/kernels
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2017-12-31 16:10:40 +0100
committer Cedric Nugteren <web@cedricnugteren.nl> 2017-12-31 16:10:40 +0100
commit 7f893a85d97d81e8bfdd4d10f32502708824e5ea (patch)
tree d0e212487248a95becb7c6896aadce49e64c9dbf /src/kernels
parent 69226ae8282d25c33fec5a0e5c6998da286aeb77 (diff)
Revert "Added options to disable parts of the invert kernel to find out where the AMD compiler crashes"
This reverts commit 407ed52cec41445f02e85cb45d08f590960216bb.
Diffstat (limited to 'src/kernels')
-rw-r--r-- src/kernels/level3/invert_diagonal_blocks_part1.opencl | 33
-rw-r--r-- src/kernels/level3/invert_diagonal_blocks_part2.opencl | 6
2 files changed, 5 insertions, 34 deletions
diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index c3d93dad..c1f96bd7 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -58,18 +58,6 @@ R"(
// =================================================================================================
#if defined(ROUTINE_INVERT)
-//#define DISABLE_PART1
-//#define DISABLE_PART1A
-//#define DISABLE_PART1B
-//#define DISABLE_PART2
-//#define DISABLE_PART3
-//#define DISABLE_PART4
-//#define DISABLE_PART5
-//#define DISABLE_PART6
-//#define DISABLE_PART7
-//#define DISABLE_PART8
-//#define DISABLE_PART9
-
// Parameters set by the tuner
// TODO: Make these actually tunable
#ifndef INTERNAL_BLOCK_SIZE
@@ -115,20 +103,16 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i
// Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
__local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
-#ifndef DISABLE_PART1
// Loads the source lower triangle into local memory. Any values in the upper triangle or
// outside of the matrix are set to zero
for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) {
bool condition = false;
-#ifndef DISABLE_PART1A
if (is_upper) {
condition = (thread_index <= _j) && (block_index_per_block + _j < n);
}
else {
condition = (thread_index >= _j) && (block_index_per_block + thread_index < n);
}
-#endif
-#ifndef DISABLE_PART1B
if (condition) {
const int src_index = _j*src_ld + thread_index + src_block_offset;
lm[thread_index][_j] = src[src_index];
@@ -136,11 +120,9 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i
else {
SetToZero(lm[thread_index][_j]);
}
-#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-#ifndef DISABLE_PART2
+
// Inverts the diagonal
real inverted_diagonal;
SetToOne(inverted_diagonal);
@@ -154,8 +136,7 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i
}
lm[thread_index][thread_index] = inverted_diagonal;
barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-#ifndef DISABLE_PART3
+
// Upper-triangular
if (is_upper) {
@@ -205,7 +186,6 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i
for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) {
dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
}
-#endif
}
// =================================================================================================
@@ -238,7 +218,6 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
for (int _j = 0; _j < 16; _j += 1) {
SetToZero(cpm[_j]);
}
-#ifndef DISABLE_PART4
// Computes NT x 16 block of C, each thread computes one 1 x 16 row
for (int k = 0; k < current_size; k += 16) {
@@ -283,8 +262,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
barrier(CLK_LOCAL_MEM_FENCE);
}
-#endif
-#ifndef DISABLE_PART5
+
// Stores NT x 16 results: each thread writes one 16 x 1 row
#pragma unroll
for (int _i = 0; _i < 16; _i += 1) {
@@ -292,7 +270,6 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
cgm[0] = cpm[_i];
cgm += ldc;
}
-#endif
}
// =================================================================================================
@@ -302,7 +279,6 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r
__global const real* src, const int a_offset, const int lda,
__global real* dest, int current_size, int num_pages, const int block_size) {
-#ifndef DISABLE_PART6
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
@@ -332,14 +308,12 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r
const int ldb = block_size;
const int ldc = block_size;
TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
-#endif
}
// Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n,
__global real* dest, int current_size, int num_pages, const int block_size) {
-#ifndef DISABLE_PART7
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
@@ -371,7 +345,6 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r
const int ldb = block_size;
const int ldc = block_size;
TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
-#endif
}
#endif
diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
index 22d8e5d7..8736203c 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
@@ -18,7 +18,6 @@ R"(
// =================================================================================================
#if defined(ROUTINE_INVERT)
-#ifndef DISABLE_PART8
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
@@ -69,10 +68,9 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
}
-#endif
+
// =================================================================================================
-#ifndef DISABLE_PART9
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
@@ -123,7 +121,7 @@ void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_s
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
}
-#endif
+
#endif
// =================================================================================================