From 7f893a85d97d81e8bfdd4d10f32502708824e5ea Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 31 Dec 2017 16:10:40 +0100 Subject: Revert "Added options to disable parts of the invert kernel to find out where the AMD compiler crashes" This reverts commit 407ed52cec41445f02e85cb45d08f590960216bb. --- .../level3/invert_diagonal_blocks_part1.opencl | 33 ++-------------------- .../level3/invert_diagonal_blocks_part2.opencl | 6 ++-- 2 files changed, 5 insertions(+), 34 deletions(-) (limited to 'src/kernels/level3') diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index c3d93dad..c1f96bd7 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -58,18 +58,6 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) -//#define DISABLE_PART1 -//#define DISABLE_PART1A -//#define DISABLE_PART1B -//#define DISABLE_PART2 -//#define DISABLE_PART3 -//#define DISABLE_PART4 -//#define DISABLE_PART5 -//#define DISABLE_PART6 -//#define DISABLE_PART7 -//#define DISABLE_PART8 -//#define DISABLE_PART9 - // Parameters set by the tuner // TODO: Make these actually tunable #ifndef INTERNAL_BLOCK_SIZE @@ -115,20 +103,16 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; -#ifndef DISABLE_PART1 // Loads the source lower triangle into local memory. Any values in the upper triangle or // outside of the matrix are set to zero for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) { bool condition = false; -#ifndef DISABLE_PART1A if (is_upper) { condition = (thread_index <= _j) && (block_index_per_block + _j < n); } else { condition = (thread_index >= _j) && (block_index_per_block + thread_index < n); } -#endif -#ifndef DISABLE_PART1B if (condition) { const int src_index = _j*src_ld + thread_index + src_block_offset; lm[thread_index][_j] = src[src_index]; @@ -136,11 +120,9 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i else { SetToZero(lm[thread_index][_j]); } -#endif } barrier(CLK_LOCAL_MEM_FENCE); -#endif -#ifndef DISABLE_PART2 + // Inverts the diagonal real inverted_diagonal; SetToOne(inverted_diagonal); @@ -154,8 +136,7 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i } lm[thread_index][thread_index] = inverted_diagonal; barrier(CLK_LOCAL_MEM_FENCE); -#endif -#ifndef DISABLE_PART3 + // Upper-triangular if (is_upper) { @@ -205,7 +186,6 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) { dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j]; } -#endif } // ================================================================================================= @@ -238,7 +218,6 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, for (int _j = 0; _j < 16; _j += 1) { SetToZero(cpm[_j]); } -#ifndef DISABLE_PART4 // Computes NT x 16 block of C, each thread computes one 1 x 16 row for (int k = 0; k < current_size; k += 16) { @@ -283,8 +262,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, barrier(CLK_LOCAL_MEM_FENCE); } -#endif -#ifndef DISABLE_PART5 + // Stores NT x 16 results: each thread writes one 16 x 1 row #pragma unroll for (int _i = 0; _i < 16; _i += 1) { @@ -292,7 +270,6 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, cgm[0] = cpm[_i]; cgm += ldc; } -#endif } // ================================================================================================= @@ -302,7 +279,6 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r __global const real* src, const int a_offset, const int lda, __global real* dest, int current_size, int num_pages, const int block_size) { -#ifndef DISABLE_PART6 // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -332,14 +308,12 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r const int ldb = block_size; const int ldc = block_size; TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); -#endif } // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n, __global real* dest, int current_size, int num_pages, const int block_size) { -#ifndef DISABLE_PART7 // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -371,7 +345,6 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r const int ldb = block_size; const int ldc = block_size; TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); -#endif } #endif diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl index 22d8e5d7..8736203c 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl @@ -18,7 +18,6 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) -#ifndef DISABLE_PART8 // B21 = A21 * B11 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, @@ -69,10 +68,9 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); } -#endif + // ================================================================================================= -#ifndef DISABLE_PART9 // B12 = A12 * B22 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, @@ -123,7 +121,7 @@ void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_s __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); } -#endif + #endif // ================================================================================================= -- cgit v1.2.3