From 2b9bf3a9aa7f2879911303d158c32842760989ba Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 27 Dec 2017 17:03:06 +0100
Subject: Simplified invert kernel a little

---
 src/kernels/level3/invert_diagonal_blocks_part1.opencl | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index 040fcc83..44b871bb 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -93,8 +93,11 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   // Sets the offset for this particular block in the source and destination matrices
   const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
   const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
-  const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block,
-                                (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that
+  const int block_index_div = block_index / num_inner_blocks;
+  const int block_index_mod = block_index % num_inner_blocks;
+  const int offset_part1 = block_index_div * outer_block_size * outer_block_size; // go to the block_index_div outer outer_block_size*outer_block_size block
+  const int offset_part2 = block_index_mod * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the block_index_mod inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that
+  const int dest_block_offset = offset_part1 + offset_part2;
 
   // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
   __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
@@ -103,8 +106,13 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   // outside of the matrix are set to zero
   #pragma unroll
   for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) {
-    const bool condition = (is_upper) ? (thread_index <= _j && block_index*INTERNAL_BLOCK_SIZE + _j < n) :
-                                        (thread_index >= _j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
+    bool condition;
+    if (is_upper) {
+      condition = (thread_index <= _j) && (block_index*INTERNAL_BLOCK_SIZE + _j < n);
+    }
+    else {
+      condition = (thread_index >= _j) && (block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
+    }
     if (condition) {
       lm[thread_index][_j] = src[_j*src_ld + thread_index + src_block_offset];
     }
-- 
cgit v1.2.3


From 0eb9b35481531d5ddc7e22371a44a12dc0e69c50 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 27 Dec 2017 17:16:08 +0100
Subject: Added a simple test to check compilation of the invert kernels (issue
 with AMD APP)

---
 CMakeLists.txt                           |  2 +-
 test/correctness/misc/compile_invert.cpp | 65 ++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 test/correctness/misc/compile_invert.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 53944b25..759f6d2e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -578,7 +578,7 @@ if(TESTS)
   endforeach()
 
   # Miscellaneous tests
-  set(MISC_TESTS override_parameters)
+  set(MISC_TESTS override_parameters compile_invert)
   if(NOT CUDA)
     set(MISC_TESTS ${MISC_TESTS} preprocessor)
   endif()
diff --git a/test/correctness/misc/compile_invert.cpp b/test/correctness/misc/compile_invert.cpp
new file mode 100644
index 00000000..4ce458d1
--- /dev/null
+++ b/test/correctness/misc/compile_invert.cpp
@@ -0,0 +1,65 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains a simple test to compile the invert kernel.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <cstdio>
+
+#include "utilities/utilities.hpp"
+#include "routines/levelx/xinvert.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+template <typename T>
+size_t CompileInvertKernels(int argc, char *argv[], const bool silent) {
+
+  // Retrieves the arguments
+  auto help = std::string{"Options given/available:\n"};
+  auto arguments = RetrieveCommandLineArguments(argc, argv);
+  const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
+  const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
+
+  // Prints the help message (command-line arguments)
+  if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); }
+
+  // Initializes OpenCL
+  const auto platform = Platform(platform_id);
+  const auto device = Device(platform, device_id);
+  const auto context = Context(device);
+  auto queue = Queue(context, device);
+
+  // Compiles the invert kernels
+  auto diagonal_invert_event = Event();
+  auto inverter = Xinvert<T>(queue, diagonal_invert_event.pointer());
+
+  // Report and return
+  printf("\n");
+  printf("    1 test(s) passed\n");
+  printf("    0 test(s) failed\n");
+  printf("\n");
+  return 0;
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  auto errors = size_t{0};
+  errors += clblast::CompileInvertKernels<float>(argc, argv, false);
+  errors += clblast::CompileInvertKernels<clblast::float2>(argc, argv, true);
+  if (errors > 0) { return 1; } else { return 0; }
+}
+
+// =================================================================================================
-- 
cgit v1.2.3


From 407ed52cec41445f02e85cb45d08f590960216bb Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 30 Dec 2017 21:07:50 +0100
Subject: Added options to disable parts of the invert kernel to find out where
 the AMD compiler crashes

---
 .../level3/invert_diagonal_blocks_part1.opencl     | 27 +++++++++++++++++++---
 .../level3/invert_diagonal_blocks_part2.opencl     |  6 +++--
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index 44b871bb..8c210c9e 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -58,6 +58,16 @@ R"(
 // =================================================================================================
 #if defined(ROUTINE_INVERT)
 
+//#define DISABLE_PART1
+//#define DISABLE_PART2
+//#define DISABLE_PART3
+//#define DISABLE_PART4
+//#define DISABLE_PART5
+//#define DISABLE_PART6
+//#define DISABLE_PART7
+//#define DISABLE_PART8
+//#define DISABLE_PART9
+
 // Parameters set by the tuner
 // TODO: Make these actually tunable
 #ifndef INTERNAL_BLOCK_SIZE
@@ -102,6 +112,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
   __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
 
+#ifdef DISABLE_PART1
   // Loads the source lower triangle into local memory. Any values in the upper triangle or
   // outside of the matrix are set to zero
   #pragma unroll
@@ -121,7 +132,8 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
     }
   }
   barrier(CLK_LOCAL_MEM_FENCE);
-  
+#endif
+#ifdef DISABLE_PART2
   // Inverts the diagonal
   real inverted_diagonal;
   SetToOne(inverted_diagonal);
@@ -135,7 +147,8 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   }
   lm[thread_index][thread_index] = inverted_diagonal;
   barrier(CLK_LOCAL_MEM_FENCE);
-
+#endif
+#ifdef DISABLE_PART3
   // Upper-triangular
   if (is_upper) {
 
@@ -185,6 +198,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) {
     dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
   }
+#endif
 }
 
 // =================================================================================================
@@ -217,6 +231,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
   for (int _j = 0; _j < 16; _j += 1) {
     SetToZero(cpm[_j]);
   }
+#ifdef DISABLE_PART4
 
   // Computes NT x 16 block of C, each thread computes one 1 x 16 row
   for (int k = 0; k < current_size; k += 16) {
@@ -261,7 +276,8 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
 
     barrier(CLK_LOCAL_MEM_FENCE);
   }
-
+#endif
+#ifdef DISABLE_PART5
   // Stores NT x 16 results: each thread writes one 16 x 1 row
   #pragma unroll
   for (int _i = 0; _i < 16; _i += 1) {
@@ -269,6 +285,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
     cgm[0] = cpm[_i];
     cgm += ldc;
   }
+#endif
 }
 
 // =================================================================================================
@@ -278,6 +295,7 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r
                                    __global const real* src, const int a_offset, const int lda,
                                    __global real* dest, int current_size, int num_pages, const int block_size) {
 
+#ifdef DISABLE_PART6
   // Emulates a 3D grid: NX * (NY * num_pages)
   const int page = get_group_id(1) % num_pages;
 
@@ -307,12 +325,14 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r
   const int ldb = block_size;
   const int ldc = block_size;
   TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
+#endif
 }
 
 // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
 INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n,
                                    __global real* dest, int current_size, int num_pages, const int block_size) {
 
+#ifdef DISABLE_PART7
   // Emulates a 3D grid: NX * (NY * num_pages)
   const int page = get_group_id(1) % num_pages;
 
@@ -344,6 +364,7 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r
   const int ldb = block_size;
   const int ldc = block_size;
   TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
+#endif
 }
 
 #endif
diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
index 8736203c..37210f77 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
@@ -18,6 +18,7 @@ R"(
 // =================================================================================================
 #if defined(ROUTINE_INVERT)
 
+#ifdef DISABLE_PART8
 // B21 = A21 * B11
 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
 void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
@@ -68,9 +69,10 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
   __local real lm[LOCALY * LOCALX];
   TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
 }
-
+#endif
 // =================================================================================================
 
+#ifdef DISABLE_PART9
 // B12 =  A12 * B22
 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
 void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
@@ -121,7 +123,7 @@ void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_s
   __local real lm[LOCALY * LOCALX];
   TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
 }
-
+#endif
 #endif
 // =================================================================================================
 
-- 
cgit v1.2.3


From 7ce415b9276e1d99f145741487f36a9034e5e035 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 30 Dec 2017 21:17:31 +0100
Subject: Changed the ifdefs into ifndefs

---
 src/kernels/level3/invert_diagonal_blocks_part1.opencl | 14 +++++++-------
 src/kernels/level3/invert_diagonal_blocks_part2.opencl |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index 8c210c9e..8da019e9 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -112,7 +112,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
   __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
 
-#ifdef DISABLE_PART1
+#ifndef DISABLE_PART1
   // Loads the source lower triangle into local memory. Any values in the upper triangle or
   // outside of the matrix are set to zero
   #pragma unroll
@@ -133,7 +133,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   }
   barrier(CLK_LOCAL_MEM_FENCE);
 #endif
-#ifdef DISABLE_PART2
+#ifndef DISABLE_PART2
   // Inverts the diagonal
   real inverted_diagonal;
   SetToOne(inverted_diagonal);
@@ -148,7 +148,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   lm[thread_index][thread_index] = inverted_diagonal;
   barrier(CLK_LOCAL_MEM_FENCE);
 #endif
-#ifdef DISABLE_PART3
+#ifndef DISABLE_PART3
   // Upper-triangular
   if (is_upper) {
 
@@ -231,7 +231,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
   for (int _j = 0; _j < 16; _j += 1) {
     SetToZero(cpm[_j]);
   }
-#ifdef DISABLE_PART4
+#ifndef DISABLE_PART4
 
   // Computes NT x 16 block of C, each thread computes one 1 x 16 row
   for (int k = 0; k < current_size; k += 16) {
@@ -277,7 +277,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
     barrier(CLK_LOCAL_MEM_FENCE);
   }
 #endif
-#ifdef DISABLE_PART5
+#ifndef DISABLE_PART5
   // Stores NT x 16 results: each thread writes one 16 x 1 row
   #pragma unroll
   for (int _i = 0; _i < 16; _i += 1) {
@@ -295,7 +295,7 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r
                                    __global const real* src, const int a_offset, const int lda,
                                    __global real* dest, int current_size, int num_pages, const int block_size) {
 
-#ifdef DISABLE_PART6
+#ifndef DISABLE_PART6
   // Emulates a 3D grid: NX * (NY * num_pages)
   const int page = get_group_id(1) % num_pages;
 
@@ -332,7 +332,7 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r
 INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n,
                                    __global real* dest, int current_size, int num_pages, const int block_size) {
 
-#ifdef DISABLE_PART7
+#ifndef DISABLE_PART7
   // Emulates a 3D grid: NX * (NY * num_pages)
   const int page = get_group_id(1) % num_pages;
 
diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
index 37210f77..22d8e5d7 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
@@ -18,7 +18,7 @@ R"(
 // =================================================================================================
 #if defined(ROUTINE_INVERT)
 
-#ifdef DISABLE_PART8
+#ifndef DISABLE_PART8
 // B21 = A21 * B11
 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
 void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
@@ -72,7 +72,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
 #endif
 // =================================================================================================
 
-#ifdef DISABLE_PART9
+#ifndef DISABLE_PART9
 // B12 =  A12 * B22
 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
 void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
-- 
cgit v1.2.3


From 69226ae8282d25c33fec5a0e5c6998da286aeb77 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 31 Dec 2017 14:07:08 +0100
Subject: Changed the invert kernel slightly; added part1a/part1b
 disable-defines

---
 .../level3/invert_diagonal_blocks_part1.opencl        | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index 8da019e9..c3d93dad 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -59,6 +59,8 @@ R"(
 #if defined(ROUTINE_INVERT)
 
 //#define DISABLE_PART1
+//#define DISABLE_PART1A
+//#define DISABLE_PART1B
 //#define DISABLE_PART2
 //#define DISABLE_PART3
 //#define DISABLE_PART4
@@ -93,7 +95,7 @@ R"(
 
 // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
 __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
-void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
+void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld,
                          __global real* restrict dest, const int outer_block_size,
                          const int unit_diagonal, const int is_upper)
 {
@@ -101,6 +103,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   const int block_index = get_group_id(0);
 
   // Sets the offset for this particular block in the source and destination matrices
+  const int block_index_per_block = block_index * INTERNAL_BLOCK_SIZE;
   const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
   const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
   const int block_index_div = block_index / num_inner_blocks;
@@ -115,21 +118,25 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
 #ifndef DISABLE_PART1
   // Loads the source lower triangle into local memory. Any values in the upper triangle or
   // outside of the matrix are set to zero
-  #pragma unroll
   for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) {
-    bool condition;
+    bool condition = false;
+#ifndef DISABLE_PART1A
     if (is_upper) {
-      condition = (thread_index <= _j) && (block_index*INTERNAL_BLOCK_SIZE + _j < n);
+      condition = (thread_index <= _j) && (block_index_per_block + _j < n);
     }
     else {
-      condition = (thread_index >= _j) && (block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
+      condition = (thread_index >= _j) && (block_index_per_block + thread_index < n);
     }
+#endif
+#ifndef DISABLE_PART1B
     if (condition) {
-      lm[thread_index][_j] = src[_j*src_ld + thread_index + src_block_offset];
+      const int src_index = _j*src_ld + thread_index + src_block_offset;
+      lm[thread_index][_j] = src[src_index];
     }
     else {
       SetToZero(lm[thread_index][_j]);
     }
+#endif
   }
   barrier(CLK_LOCAL_MEM_FENCE);
 #endif
-- 
cgit v1.2.3


From 7f893a85d97d81e8bfdd4d10f32502708824e5ea Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 31 Dec 2017 16:10:40 +0100
Subject: Revert "Added options to disable parts of the invert kernel to find
 out where the AMD compiler crashes"

This reverts commit 407ed52cec41445f02e85cb45d08f590960216bb.
---
 .../level3/invert_diagonal_blocks_part1.opencl     | 33 ++--------------------
 .../level3/invert_diagonal_blocks_part2.opencl     |  6 ++--
 2 files changed, 5 insertions(+), 34 deletions(-)

diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index c3d93dad..c1f96bd7 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -58,18 +58,6 @@ R"(
 // =================================================================================================
 #if defined(ROUTINE_INVERT)
 
-//#define DISABLE_PART1
-//#define DISABLE_PART1A
-//#define DISABLE_PART1B
-//#define DISABLE_PART2
-//#define DISABLE_PART3
-//#define DISABLE_PART4
-//#define DISABLE_PART5
-//#define DISABLE_PART6
-//#define DISABLE_PART7
-//#define DISABLE_PART8
-//#define DISABLE_PART9
-
 // Parameters set by the tuner
 // TODO: Make these actually tunable
 #ifndef INTERNAL_BLOCK_SIZE
@@ -115,20 +103,16 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i
   // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
   __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
 
-#ifndef DISABLE_PART1
   // Loads the source lower triangle into local memory. Any values in the upper triangle or
   // outside of the matrix are set to zero
   for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) {
     bool condition = false;
-#ifndef DISABLE_PART1A
     if (is_upper) {
       condition = (thread_index <= _j) && (block_index_per_block + _j < n);
     }
     else {
       condition = (thread_index >= _j) && (block_index_per_block + thread_index < n);
     }
-#endif
-#ifndef DISABLE_PART1B
     if (condition) {
       const int src_index = _j*src_ld + thread_index + src_block_offset;
       lm[thread_index][_j] = src[src_index];
@@ -136,11 +120,9 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i
     else {
       SetToZero(lm[thread_index][_j]);
     }
-#endif
   }
   barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-#ifndef DISABLE_PART2
+
   // Inverts the diagonal
   real inverted_diagonal;
   SetToOne(inverted_diagonal);
@@ -154,8 +136,7 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i
   }
   lm[thread_index][thread_index] = inverted_diagonal;
   barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-#ifndef DISABLE_PART3
+
   // Upper-triangular
   if (is_upper) {
 
@@ -205,7 +186,6 @@ void InvertDiagonalBlock(const int n, __global const real* restrict src, const i
   for (int j = 0; j < INTERNAL_BLOCK_SIZE; j += 1) {
     dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
   }
-#endif
 }
 
 // =================================================================================================
@@ -238,7 +218,6 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
   for (int _j = 0; _j < 16; _j += 1) {
     SetToZero(cpm[_j]);
   }
-#ifndef DISABLE_PART4
 
   // Computes NT x 16 block of C, each thread computes one 1 x 16 row
   for (int k = 0; k < current_size; k += 16) {
@@ -283,8 +262,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
 
     barrier(CLK_LOCAL_MEM_FENCE);
   }
-#endif
-#ifndef DISABLE_PART5
+
   // Stores NT x 16 results: each thread writes one 16 x 1 row
   #pragma unroll
   for (int _i = 0; _i < 16; _i += 1) {
@@ -292,7 +270,6 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
     cgm[0] = cpm[_i];
     cgm += ldc;
   }
-#endif
 }
 
 // =================================================================================================
@@ -302,7 +279,6 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r
                                    __global const real* src, const int a_offset, const int lda,
                                    __global real* dest, int current_size, int num_pages, const int block_size) {
 
-#ifndef DISABLE_PART6
   // Emulates a 3D grid: NX * (NY * num_pages)
   const int page = get_group_id(1) % num_pages;
 
@@ -332,14 +308,12 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR r
   const int ldb = block_size;
   const int ldc = block_size;
   TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
-#endif
 }
 
 // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
 INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n,
                                    __global real* dest, int current_size, int num_pages, const int block_size) {
 
-#ifndef DISABLE_PART7
   // Emulates a 3D grid: NX * (NY * num_pages)
   const int page = get_group_id(1) % num_pages;
 
@@ -371,7 +345,6 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r
   const int ldb = block_size;
   const int ldc = block_size;
   TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
-#endif
 }
 
 #endif
diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
index 22d8e5d7..8736203c 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
@@ -18,7 +18,6 @@ R"(
 // =================================================================================================
 #if defined(ROUTINE_INVERT)
 
-#ifndef DISABLE_PART8
 // B21 = A21 * B11
 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
 void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
@@ -69,10 +68,9 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
   __local real lm[LOCALY * LOCALX];
   TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
 }
-#endif
+
 // =================================================================================================
 
-#ifndef DISABLE_PART9
 // B12 =  A12 * B22
 __kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
 void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
@@ -123,7 +121,7 @@ void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_s
   __local real lm[LOCALY * LOCALX];
   TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
 }
-#endif
+
 #endif
 // =================================================================================================
 
-- 
cgit v1.2.3


From 1511909b6ffeb1cc1f3ee6b414c079e35a72a60d Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 31 Dec 2017 16:11:35 +0100
Subject: Revert "Added a simple test to check compilation of the invert
 kernels (issue with AMD APP)"

This reverts commit 0eb9b35481531d5ddc7e22371a44a12dc0e69c50.
---
 CMakeLists.txt                           |  2 +-
 test/correctness/misc/compile_invert.cpp | 65 --------------------------------
 2 files changed, 1 insertion(+), 66 deletions(-)
 delete mode 100644 test/correctness/misc/compile_invert.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 759f6d2e..53944b25 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -578,7 +578,7 @@ if(TESTS)
   endforeach()
 
   # Miscellaneous tests
-  set(MISC_TESTS override_parameters compile_invert)
+  set(MISC_TESTS override_parameters)
   if(NOT CUDA)
     set(MISC_TESTS ${MISC_TESTS} preprocessor)
   endif()
diff --git a/test/correctness/misc/compile_invert.cpp b/test/correctness/misc/compile_invert.cpp
deleted file mode 100644
index 4ce458d1..00000000
--- a/test/correctness/misc/compile_invert.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains a simple test to compile the invert kernel.
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-#include <cstdio>
-
-#include "utilities/utilities.hpp"
-#include "routines/levelx/xinvert.hpp"
-
-namespace clblast {
-// =================================================================================================
-
-template <typename T>
-size_t CompileInvertKernels(int argc, char *argv[], const bool silent) {
-
-  // Retrieves the arguments
-  auto help = std::string{"Options given/available:\n"};
-  auto arguments = RetrieveCommandLineArguments(argc, argv);
-  const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
-  const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
-
-  // Prints the help message (command-line arguments)
-  if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); }
-
-  // Initializes OpenCL
-  const auto platform = Platform(platform_id);
-  const auto device = Device(platform, device_id);
-  const auto context = Context(device);
-  auto queue = Queue(context, device);
-
-  // Compiles the invert kernels
-  auto diagonal_invert_event = Event();
-  auto inverter = Xinvert<T>(queue, diagonal_invert_event.pointer());
-
-  // Report and return
-  printf("\n");
-  printf("    1 test(s) passed\n");
-  printf("    0 test(s) failed\n");
-  printf("\n");
-  return 0;
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  auto errors = size_t{0};
-  errors += clblast::CompileInvertKernels<float>(argc, argv, false);
-  errors += clblast::CompileInvertKernels<clblast::float2>(argc, argv, true);
-  if (errors > 0) { return 1; } else { return 0; }
-}
-
-// =================================================================================================
-- 
cgit v1.2.3


From ad483123e6f7aab223417d8387baf74ae098a2a2 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 31 Dec 2017 16:13:13 +0100
Subject: Fixed the issue with AMD's APP compiler not being able to compile the
 invert kernel

---
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG b/CHANGELOG
index e2f0d872..d49cb3f5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -5,6 +5,7 @@ Development (next version)
 - Added OpenCL pre-processor to unroll loops and perform array-to-register promotions for compilers
   which don't do this themselves (ARM Mali) - greatly improves performance on these platforms
 - Added first tuners for the TRSV (block size) and TRSM (invert kernel) routines
+- Fixed an issue with a crashing/hanging AMD APP compiler with the TRSM routine (invert kernel)
 - Improved compilation time by splitting the tuning database into multiple compilation units
 - Various minor fixes and enhancements
 - Added tuned parameters for various devices (see README)
-- 
cgit v1.2.3