From 2b9bf3a9aa7f2879911303d158c32842760989ba Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 27 Dec 2017 17:03:06 +0100 Subject: Simplified invert kernel a little --- src/kernels/level3/invert_diagonal_blocks_part1.opencl | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index 040fcc83..44b871bb 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -93,8 +93,11 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // Sets the offset for this particular block in the source and destination matrices const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset; const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE; - const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block, - (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that + const int block_index_div = block_index / num_inner_blocks; + const int block_index_mod = block_index % num_inner_blocks; + const int offset_part1 = block_index_div * outer_block_size * outer_block_size; // go to the block_index_div outer outer_block_size*outer_block_size block + const int offset_part2 = block_index_mod * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the block_index_mod inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that + const int dest_block_offset = offset_part1 + offset_part2; // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; @@ -103,8 +106,13 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // outside of the matrix are set to zero #pragma unroll for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) { - const bool condition = (is_upper) ? (thread_index <= _j && block_index*INTERNAL_BLOCK_SIZE + _j < n) : - (thread_index >= _j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n); + bool condition; + if (is_upper) { + condition = (thread_index <= _j) && (block_index*INTERNAL_BLOCK_SIZE + _j < n); + } + else { + condition = (thread_index >= _j) && (block_index*INTERNAL_BLOCK_SIZE + thread_index < n); + } if (condition) { lm[thread_index][_j] = src[_j*src_ld + thread_index + src_block_offset]; } -- cgit v1.2.3 From 0eb9b35481531d5ddc7e22371a44a12dc0e69c50 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 27 Dec 2017 17:16:08 +0100 Subject: Added a simple test to check compilation of the invert kernels (issue with AMD APP) --- CMakeLists.txt | 2 +- test/correctness/misc/compile_invert.cpp | 65 ++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 test/correctness/misc/compile_invert.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 53944b25..759f6d2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -578,7 +578,7 @@ if(TESTS) endforeach() # Miscellaneous tests - set(MISC_TESTS override_parameters) + set(MISC_TESTS override_parameters compile_invert) if(NOT CUDA) set(MISC_TESTS ${MISC_TESTS} preprocessor) endif() diff --git a/test/correctness/misc/compile_invert.cpp b/test/correctness/misc/compile_invert.cpp new file mode 100644 index 00000000..4ce458d1 --- /dev/null +++ b/test/correctness/misc/compile_invert.cpp @@ -0,0 +1,65 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains a simple test to compile the invert kernel. +// +// ================================================================================================= + +#include +#include +#include + +#include "utilities/utilities.hpp" +#include "routines/levelx/xinvert.hpp" + +namespace clblast { +// ================================================================================================= + +template +size_t CompileInvertKernels(int argc, char *argv[], const bool silent) { + + // Retrieves the arguments + auto help = std::string{"Options given/available:\n"}; + auto arguments = RetrieveCommandLineArguments(argc, argv); + const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); + const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); + + // Prints the help message (command-line arguments) + if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); } + + // Initializes OpenCL + const auto platform = Platform(platform_id); + const auto device = Device(platform, device_id); + const auto context = Context(device); + auto queue = Queue(context, device); + + // Compiles the invert kernels + auto diagonal_invert_event = Event(); + auto inverter = Xinvert(queue, diagonal_invert_event.pointer()); + + // Report and return + printf("\n"); + printf(" 1 test(s) passed\n"); + printf(" 0 test(s) failed\n"); + printf("\n"); + return 0; +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::CompileInvertKernels(argc, argv, false); + errors += clblast::CompileInvertKernels(argc, argv, true); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= -- cgit v1.2.3 From 407ed52cec41445f02e85cb45d08f590960216bb Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 30 Dec 2017 21:07:50 +0100 Subject: Added options to disable parts of the invert kernel to find out where the AMD compiler crashes --- .../level3/invert_diagonal_blocks_part1.opencl | 27 +++++++++++++++++++--- .../level3/invert_diagonal_blocks_part2.opencl | 6 +++-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index 44b871bb..8c210c9e 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -58,6 +58,16 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) +//#define DISABLE_PART1 +//#define DISABLE_PART2 +//#define DISABLE_PART3 +//#define DISABLE_PART4 +//#define DISABLE_PART5 +//#define DISABLE_PART6 +//#define DISABLE_PART7 +//#define DISABLE_PART8 +//#define DISABLE_PART9 + // Parameters set by the tuner // TODO: Make these actually tunable #ifndef INTERNAL_BLOCK_SIZE @@ -102,6 +112,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; +#ifdef DISABLE_PART1 // Loads the source lower triangle into local memory. Any values in the upper triangle or // outside of the matrix are set to zero #pragma unroll @@ -121,7 +132,8 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src } } barrier(CLK_LOCAL_MEM_FENCE); - +#endif +#ifdef DISABLE_PART2 // Inverts the diagonal real inverted_diagonal; SetToOne(inverted_diagonal); @@ -135,7 +147,8 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src } lm[thread_index][thread_index] = inverted_diagonal; barrier(CLK_LOCAL_MEM_FENCE); - +#endif +#ifdef DISABLE_PART3 // Upper-triangular if (is_upper) { @@ -185,6 +198,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) { dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j]; } +#endif } // ================================================================================================= @@ -217,6 +231,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, for (int _j = 0; _j < 16; _j += 1) { SetToZero(cpm[_j]); } +#ifdef DISABLE_PART4 // Computes NT x 16 block of C, each thread computes one 1 x 16 row for (int k = 0; k < current_size; k += 16) { @@ -261,7 +276,8 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, barrier(CLK_LOCAL_MEM_FENCE); } - +#endif +#ifdef DISABLE_PART5 // Stores NT x 16 results: each thread writes one 16 x 1 row #pragma unroll for (int _i = 0; _i < 16; _i += 1) { @@ -269,6 +285,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, cgm[0] = cpm[_i]; cgm += ldc; } +#endif } // ================================================================================================= @@ -278,6 +295,7 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r __global const real* src, const int a_offset, const int lda, __global real* dest, int current_size, int num_pages, const int block_size) { +#ifdef DISABLE_PART6 // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -307,12 +325,14 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r const int ldb = block_size; const int ldc = block_size; TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); +#endif } // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n, __global real* dest, int current_size, int num_pages, const int block_size) { +#ifdef DISABLE_PART7 // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -344,6 +364,7 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r const int ldb = block_size; const int ldc = block_size; TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); +#endif } #endif diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl index 8736203c..37210f77 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl @@ -18,6 +18,7 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) +#ifdef DISABLE_PART8 // B21 = A21 * B11 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, @@ -68,9 +69,10 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); } - +#endif // ================================================================================================= +#ifdef DISABLE_PART9 // B12 = A12 * B22 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, @@ -121,7 +123,7 @@ void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_s __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); } - +#endif #endif // ================================================================================================= -- cgit v1.2.3 From 7ce415b9276e1d99f145741487f36a9034e5e035 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 30 Dec 2017 21:17:31 +0100 Subject: Fixed ifdef's into ifndef's --- src/kernels/level3/invert_diagonal_blocks_part1.opencl | 14 +++++++------- src/kernels/level3/invert_diagonal_blocks_part2.opencl | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index 8c210c9e..8da019e9 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -112,7 +112,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; -#ifdef DISABLE_PART1 +#ifndef DISABLE_PART1 // Loads the source lower triangle into local memory. Any values in the upper triangle or // outside of the matrix are set to zero #pragma unroll @@ -133,7 +133,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src } barrier(CLK_LOCAL_MEM_FENCE); #endif -#ifdef DISABLE_PART2 +#ifndef DISABLE_PART2 // Inverts the diagonal real inverted_diagonal; SetToOne(inverted_diagonal); @@ -148,7 +148,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src lm[thread_index][thread_index] = inverted_diagonal; barrier(CLK_LOCAL_MEM_FENCE); #endif -#ifdef DISABLE_PART3 +#ifndef DISABLE_PART3 // Upper-triangular if (is_upper) { @@ -231,7 +231,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, for (int _j = 0; _j < 16; _j += 1) { SetToZero(cpm[_j]); } -#ifdef DISABLE_PART4 +#ifndef DISABLE_PART4 // Computes NT x 16 block of C, each thread computes one 1 x 16 row for (int k = 0; k < current_size; k += 16) { @@ -277,7 +277,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, barrier(CLK_LOCAL_MEM_FENCE); } #endif -#ifdef DISABLE_PART5 +#ifndef DISABLE_PART5 // Stores NT x 16 results: each thread writes one 16 x 1 row #pragma unroll for (int _i = 0; _i < 16; _i += 1) { @@ -295,7 +295,7 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r __global const real* src, const int a_offset, const int lda, __global real* dest, int current_size, int num_pages, const int block_size) { -#ifdef DISABLE_PART6 +#ifndef DISABLE_PART6 // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -332,7 +332,7 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n, __global real* dest, int current_size, int num_pages, const int block_size) { -#ifdef DISABLE_PART7 +#ifndef DISABLE_PART7 // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl index 37210f77..22d8e5d7 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl @@ -18,7 +18,7 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) -#ifdef DISABLE_PART8 +#ifndef DISABLE_PART8 // B21 = A21 * B11 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, @@ -72,7 +72,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s #endif // ================================================================================================= -#ifdef DISABLE_PART9 +#ifndef DISABLE_PART9 // B12 = A12 * B22 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, -- cgit v1.2.3 From 69226ae8282d25c33fec5a0e5c6998da286aeb77 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 31 Dec 2017 14:07:08 +0100 Subject: Changed the invert kernel slightly; added part1a/part1b disable-defines --- .../level3/invert_diagonal_blocks_part1.opencl | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index 8da019e9..c3d93dad 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -59,6 +59,8 @@ R"( #if defined(ROUTINE_INVERT) //#define DISABLE_PART1 +//#define DISABLE_PART1A +//#define DISABLE_PART1B //#define DISABLE_PART2 //#define DISABLE_PART3 //#define DISABLE_PART4 @@ -93,7 +95,7 @@ R"( // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) -void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld, +void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld, __global real* restrict dest, const int outer_block_size, const int unit_diagonal, const int is_upper) { @@ -101,6 +103,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src const int block_index = get_group_id(0); // Sets the offset for this particular block in the source and destination matrices + const int block_index_per_block = block_index * INTERNAL_BLOCK_SIZE; const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset; const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE; const int block_index_div = block_index / num_inner_blocks; @@ -115,21 +118,25 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src #ifndef DISABLE_PART1 // Loads the source lower triangle into local memory. Any values in the upper triangle or // outside of the matrix are set to zero - #pragma unroll for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) { - bool condition; + bool condition = false; +#ifndef DISABLE_PART1A if (is_upper) { - condition = (thread_index <= _j) && (block_index*INTERNAL_BLOCK_SIZE + _j < n); + condition = (thread_index <= _j) && (block_index_per_block + _j < n); } else { - condition = (thread_index >= _j) && (block_index*INTERNAL_BLOCK_SIZE + thread_index < n); + condition = (thread_index >= _j) && (block_index_per_block + thread_index < n); } +#endif +#ifndef DISABLE_PART1B if (condition) { - lm[thread_index][_j] = src[_j*src_ld + thread_index + src_block_offset]; + const int src_index = _j*src_ld + thread_index + src_block_offset; + lm[thread_index][_j] = src[src_index]; } else { SetToZero(lm[thread_index][_j]); } +#endif } barrier(CLK_LOCAL_MEM_FENCE); #endif -- cgit v1.2.3 From 7f893a85d97d81e8bfdd4d10f32502708824e5ea Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 31 Dec 2017 16:10:40 +0100 Subject: Revert "Added options to disable parts of the invert kernel to find out where the AMD compiler crashes" This reverts commit 407ed52cec41445f02e85cb45d08f590960216bb. --- .../level3/invert_diagonal_blocks_part1.opencl | 33 ++-------------------- .../level3/invert_diagonal_blocks_part2.opencl | 6 ++-- 2 files changed, 5 insertions(+), 34 deletions(-) diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index c3d93dad..c1f96bd7 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -58,18 +58,6 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) -//#define DISABLE_PART1 -//#define DISABLE_PART1A -//#define DISABLE_PART1B -//#define DISABLE_PART2 -//#define DISABLE_PART3 -//#define DISABLE_PART4 -//#define DISABLE_PART5 -//#define DISABLE_PART6 -//#define DISABLE_PART7 -//#define DISABLE_PART8 -//#define DISABLE_PART9 - // Parameters set by the tuner // TODO: Make these actually tunable #ifndef INTERNAL_BLOCK_SIZE @@ -115,20 +103,16 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; -#ifndef DISABLE_PART1 // Loads the source lower triangle into local memory. Any values in the upper triangle or // outside of the matrix are set to zero for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) { bool condition = false; -#ifndef DISABLE_PART1A if (is_upper) { condition = (thread_index <= _j) && (block_index_per_block + _j < n); } else { condition = (thread_index >= _j) && (block_index_per_block + thread_index < n); } -#endif -#ifndef DISABLE_PART1B if (condition) { const int src_index = _j*src_ld + thread_index + src_block_offset; lm[thread_index][_j] = src[src_index]; @@ -136,11 +120,9 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i else { SetToZero(lm[thread_index][_j]); } -#endif } barrier(CLK_LOCAL_MEM_FENCE); -#endif -#ifndef DISABLE_PART2 + // Inverts the diagonal real inverted_diagonal; SetToOne(inverted_diagonal); @@ -154,8 +136,7 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i } lm[thread_index][thread_index] = inverted_diagonal; barrier(CLK_LOCAL_MEM_FENCE); -#endif -#ifndef DISABLE_PART3 + // Upper-triangular if (is_upper) { @@ -205,7 +186,6 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) { dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j]; } -#endif } // ================================================================================================= @@ -238,7 +218,6 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, for (int _j = 0; _j < 16; _j += 1) { SetToZero(cpm[_j]); } -#ifndef DISABLE_PART4 // Computes NT x 16 block of C, each thread computes one 1 x 16 row for (int k = 0; k < current_size; k += 16) { @@ -283,8 +262,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, barrier(CLK_LOCAL_MEM_FENCE); } -#endif -#ifndef DISABLE_PART5 + // Stores NT x 16 results: each thread writes one 16 x 1 row #pragma unroll for (int _i = 0; _i < 16; _i += 1) { @@ -292,7 +270,6 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, cgm[0] = cpm[_i]; cgm += ldc; } -#endif } // ================================================================================================= @@ -302,7 +279,6 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r __global const real* src, const int a_offset, const int lda, __global real* dest, int current_size, int num_pages, const int block_size) { -#ifndef DISABLE_PART6 // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -332,14 +308,12 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r const int ldb = block_size; const int ldc = block_size; TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); -#endif } // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n, __global real* dest, int current_size, int num_pages, const int block_size) { -#ifndef DISABLE_PART7 // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -371,7 +345,6 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r const int ldb = block_size; const int ldc = block_size; TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); -#endif } #endif diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl index 22d8e5d7..8736203c 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl @@ -18,7 +18,6 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) -#ifndef DISABLE_PART8 // B21 = A21 * B11 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, @@ -69,10 +68,9 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); } -#endif + // ================================================================================================= -#ifndef DISABLE_PART9 // B12 = A12 * B22 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, @@ -123,7 +121,7 @@ void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_s __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); } -#endif + #endif // ================================================================================================= -- cgit v1.2.3 From 1511909b6ffeb1cc1f3ee6b414c079e35a72a60d Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 31 Dec 2017 16:11:35 +0100 Subject: Revert "Added a simple test to check compilation of the invert kernels (issue with AMD APP)" This reverts commit 0eb9b35481531d5ddc7e22371a44a12dc0e69c50. --- CMakeLists.txt | 2 +- test/correctness/misc/compile_invert.cpp | 65 -------------------------------- 2 files changed, 1 insertion(+), 66 deletions(-) delete mode 100644 test/correctness/misc/compile_invert.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 759f6d2e..53944b25 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -578,7 +578,7 @@ if(TESTS) endforeach() # Miscellaneous tests - set(MISC_TESTS override_parameters compile_invert) + set(MISC_TESTS override_parameters) if(NOT CUDA) set(MISC_TESTS ${MISC_TESTS} preprocessor) endif() diff --git a/test/correctness/misc/compile_invert.cpp b/test/correctness/misc/compile_invert.cpp deleted file mode 100644 index 4ce458d1..00000000 --- a/test/correctness/misc/compile_invert.cpp +++ /dev/null @@ -1,65 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains a simple test to compile the invert kernel. -// -// ================================================================================================= - -#include -#include -#include - -#include "utilities/utilities.hpp" -#include "routines/levelx/xinvert.hpp" - -namespace clblast { -// ================================================================================================= - -template -size_t CompileInvertKernels(int argc, char *argv[], const bool silent) { - - // Retrieves the arguments - auto help = std::string{"Options given/available:\n"}; - auto arguments = RetrieveCommandLineArguments(argc, argv); - const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); - const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); - - // Prints the help message (command-line arguments) - if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); } - - // Initializes OpenCL - const auto platform = Platform(platform_id); - const auto device = Device(platform, device_id); - const auto context = Context(device); - auto queue = Queue(context, device); - - // Compiles the invert kernels - auto diagonal_invert_event = Event(); - auto inverter = Xinvert(queue, diagonal_invert_event.pointer()); - - // Report and return - printf("\n"); - printf(" 1 test(s) passed\n"); - printf(" 0 test(s) failed\n"); - printf("\n"); - return 0; -} - -// ================================================================================================= -} // namespace clblast - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - auto errors = size_t{0}; - errors += clblast::CompileInvertKernels(argc, argv, false); - errors += clblast::CompileInvertKernels(argc, argv, true); - if (errors > 0) { return 1; } else { return 0; } -} - -// ================================================================================================= -- cgit v1.2.3 From ad483123e6f7aab223417d8387baf74ae098a2a2 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 31 Dec 2017 16:13:13 +0100 Subject: Fixed the issue with AMD's APP compiler not being able to compile the invert kernel --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index e2f0d872..d49cb3f5 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,6 +5,7 @@ Development (next version) - Added OpenCL pre-processor to unroll loops and perform array-to-register promotions for compilers which don't do this themselves (ARM Mali) - greatly improves performance on these platforms - Added first tuners for the TRSV (block size) and TRSM (invert kernel) routines +- Fixed an issue with a crashing/hanging AMD APP compiler with the TRSM routine (invert kernel) - Improved compilation time by splitting the tuning database into multiple compilation units - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) -- cgit v1.2.3