From 407ed52cec41445f02e85cb45d08f590960216bb Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 30 Dec 2017 21:07:50 +0100 Subject: Added options to disable parts of the invert kernel to find out where the AMD compiler crashes --- .../level3/invert_diagonal_blocks_part1.opencl | 27 +++++++++++++++++++--- .../level3/invert_diagonal_blocks_part2.opencl | 6 +++-- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'src/kernels') diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index 44b871bb..8c210c9e 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -58,6 +58,16 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) +//#define DISABLE_PART1 +//#define DISABLE_PART2 +//#define DISABLE_PART3 +//#define DISABLE_PART4 +//#define DISABLE_PART5 +//#define DISABLE_PART6 +//#define DISABLE_PART7 +//#define DISABLE_PART8 +//#define DISABLE_PART9 + // Parameters set by the tuner // TODO: Make these actually tunable #ifndef INTERNAL_BLOCK_SIZE @@ -102,6 +112,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; +#ifndef DISABLE_PART1 // Loads the source lower triangle into local memory. 
Any values in the upper triangle or // outside of the matrix are set to zero #pragma unroll @@ -121,7 +132,8 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src } } barrier(CLK_LOCAL_MEM_FENCE); - +#endif +#ifndef DISABLE_PART2 // Inverts the diagonal real inverted_diagonal; SetToOne(inverted_diagonal); @@ -135,7 +147,8 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src } lm[thread_index][thread_index] = inverted_diagonal; barrier(CLK_LOCAL_MEM_FENCE); - +#endif +#ifndef DISABLE_PART3 // Upper-triangular if (is_upper) { @@ -185,6 +198,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) { dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j]; } +#endif } // ================================================================================================= @@ -217,6 +231,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, for (int _j = 0; _j < 16; _j += 1) { SetToZero(cpm[_j]); } +#ifndef DISABLE_PART4 // Computes NT x 16 block of C, each thread computes one 1 x 16 row for (int k = 0; k < current_size; k += 16) { @@ -261,7 +276,8 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, barrier(CLK_LOCAL_MEM_FENCE); } - +#endif +#ifndef DISABLE_PART5 // Stores NT x 16 results: each thread writes one 16 x 1 row #pragma unroll for (int _i = 0; _i < 16; _i += 1) { @@ -269,6 +285,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, cgm[0] = cpm[_i]; cgm += ldc; } +#endif } // ================================================================================================= @@ -278,6 +295,7 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r __global const real* src, const int a_offset, const int lda, __global real* dest, int current_size, int num_pages, const int block_size) { +#ifndef 
DISABLE_PART6 // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -307,12 +325,14 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r const int ldb = block_size; const int ldc = block_size; TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); +#endif } // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n, __global real* dest, int current_size, int num_pages, const int block_size) { +#ifndef DISABLE_PART7 // Emulates a 3D grid: NX * (NY * num_pages) const int page = get_group_id(1) % num_pages; @@ -344,6 +364,7 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r const int ldb = block_size; const int ldc = block_size; TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); +#endif } #endif diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl index 8736203c..37210f77 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl @@ -18,6 +18,7 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) +#ifndef DISABLE_PART8 // B21 = A21 * B11 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, @@ -68,9 +69,10 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); } - +#endif // 
================================================================================================= +#ifndef DISABLE_PART9 // B12 = A12 * B22 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, @@ -121,7 +123,7 @@ void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_s __local real lm[LOCALY * LOCALX]; TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); } - +#endif #endif // ================================================================================================= -- cgit v1.2.3