Diffstat (limited to 'src/kernels/level3/invert_diagonal_blocks.opencl')
-rw-r--r--  src/kernels/level3/invert_diagonal_blocks.opencl  |  50
1 file changed, 34 insertions(+), 16 deletions(-)
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index db1513c1..e8f0ea91 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -56,8 +56,26 @@ R"(
// =================================================================================================
#if defined(ROUTINE_INVERT)
-#define LOCALX 17 // 16 + 1 to avoid bank conflicts
-#define LOCALY 16
+// Parameters set by the tuner
+// TODO: Make these actually tunable
+#ifndef INTERNAL_BLOCK_SIZE
+ #define INTERNAL_BLOCK_SIZE 16 // Internal block size of the invert kernel
+#endif
+#ifndef LOCALPAD
+ #define LOCALPAD 0 // Padding in the x-dimension of the local memory to avoid bank conflicts
+#endif
+#ifndef LOCALX
+ #define LOCALX (16 + LOCALPAD) // Local memory size in x-dimension of TripleMatMul kernels
+#endif
+#ifndef LOCALY
+ #define LOCALY 16 // Local memory size in y-dimension of TripleMatMul kernels
+#endif
+#ifndef TMMWGSX
+ #define TMMWGSX 4 // Work-group size in x-dimension of TripleMatMul kernels
+#endif
+#ifndef TMMWGSY
+ #define TMMWGSY 4 // Work-group size in y-dimension of TripleMatMul kernels
+#endif
// =================================================================================================
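The hunk above turns the previously hard-coded constants into overridable preprocessor definitions. As a minimal host-side sketch (not CLBlast's actual host code; the helper name and the parameter values are illustrative), the defaults can be overridden by passing -D options when the kernel source is built with the plain OpenCL C API:

#include <CL/cl.h>

/* Hypothetical helper: build the program with overridden tuning parameters.
 * The values are examples only; omitting a -D flag falls back to the
 * #ifndef defaults in the kernel file above. */
static cl_int build_with_tuning(cl_program program, cl_device_id device) {
  const char *options = "-DINTERNAL_BLOCK_SIZE=16 -DLOCALPAD=1 "
                        "-DTMMWGSX=4 -DTMMWGSY=4";
  return clBuildProgram(program, 1, &device, options, NULL, NULL);
}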
@@ -172,7 +190,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
const int page = get_group_id(1) % num_pages;
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
- const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1));
+ const int ibx = get_group_id(0) * (get_local_size(0) * TMMWGSY);
const int iby = by*16;
const int id = lidx + lidy*get_local_size(0);
const int row = page*current_size*2 + current_size + ibx + id;
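With the defaults above, the rewritten ibx evaluates to the same value as before, since reqd_work_group_size makes get_local_size(1) equal to TMMWGSY. A worked example with assumed values (a TripleMatMul16 launch, work-group 2 in x, local id (1, 3), page 0, current_size 16):

#include <stdio.h>

int main(void) {
  const int lidx = 1, lidy = 3, group_x = 2, page = 0, current_size = 16;
  const int ibx = group_x * (4 * 4);   /* get_group_id(0) * (get_local_size(0) * TMMWGSY) */
  const int id  = lidx + lidy * 4;     /* lidx + lidy * get_local_size(0) */
  const int row = page * current_size * 2 + current_size + ibx + id;
  printf("ibx=%d id=%d row=%d\n", ibx, id, row);  /* prints: ibx=32 id=13 row=61 */
  return 0;
}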
@@ -195,7 +213,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
// Loads a 16 x 16 block of B into local memory using NX x 4 threads
for (int i = 0; i < 16; i += (size/4) ) { // += get_local_size(0)
- for (int _j = 0; _j < 16; _j += 4 ) { // += get_local_size(1)
+ for (int _j = 0; _j < 16; _j += TMMWGSY ) { // += get_local_size(1)
blm[(lidx + i) * LOCALX + (lidy + _j)] = bgm[k + i + _j*ldb];
}
}
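The load above stores the 16 x 16 tile of B with a row stride of LOCALX = 16 + LOCALPAD. A stand-alone sketch of the same padded-indexing idea (illustration only, assuming LOCALPAD = 1 and a full 16 x 16 work-group rather than the NX x 4 thread layout the kernel actually uses):

// Without the pad, work-items 0..15 (varying x, fixed y) would write addresses
// 16 words apart and collide on a few local-memory banks; with a row stride of
// 17 the 16 addresses fall into 16 different banks.
__kernel void padded_tile_example(__global float* out) {
  __local float tile[16 * (16 + 1)];               // LOCALY * LOCALX
  const int x = get_local_id(0);
  const int y = get_local_id(1);
  tile[x * (16 + 1) + y] = (float)(x * 16 + y);    // column-wise write, same pattern as blm[] above
  barrier(CLK_LOCAL_MEM_FENCE);
  out[get_global_id(1) * get_global_size(0) + get_global_id(0)] =
      tile[y * (16 + 1) + x];                       // row-wise read back
}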
@@ -321,7 +339,7 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r
// =================================================================================================
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
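The remaining hunks all apply the same change: kernels operating on 16-, 32- and 64-wide blocks scale the x-dimension of the required work-group size by 1, 2 and 4, so the default TMMWGSX = TMMWGSY = 4 reproduces the previous hard-coded sizes (4, 4), (8, 4) and (16, 4). A hypothetical host-side helper (illustrative, not CLBlast code) deriving the matching local work size under those defaults:

#include <stddef.h>

static void tmm_local_size(int block_size, size_t local[2]) {
  const size_t tmmwgsx = 4;                         /* assumed default */
  const size_t tmmwgsy = 4;                         /* assumed default */
  local[0] = (size_t)(block_size / 16) * tmmwgsx;   /* 16 -> 4, 32 -> 8, 64 -> 16 */
  local[1] = tmmwgsy;
}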
@@ -330,7 +348,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -338,7 +356,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -347,7 +365,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -355,7 +373,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -364,7 +382,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -374,7 +392,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
// =================================================================================================
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -383,7 +401,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -391,7 +409,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -400,7 +418,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -408,7 +426,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -417,7 +435,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];