Diffstat (limited to 'src/kernels/level3/invert_diagonal_blocks.opencl')
-rw-r--r--  src/kernels/level3/invert_diagonal_blocks.opencl  |  50
1 file changed, 34 insertions(+), 16 deletions(-)
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index db1513c1..e8f0ea91 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -56,8 +56,26 @@ R"(
// =================================================================================================
#if defined(ROUTINE_INVERT)
-#define LOCALX 17 // 16 + 1 to avoid bank conflicts
-#define LOCALY 16
+// Parameters set by the tuner
+// TODO: Make these actually tunable
+#ifndef INTERNAL_BLOCK_SIZE
+ #define INTERNAL_BLOCK_SIZE 16 // Internal block size of the invert kernel
+#endif
+#ifndef LOCALPAD
+ #define LOCALPAD 0 // Padding in the x-dimension of the local memory to avoid bank conflicts
+#endif
+#ifndef LOCALX
+ #define LOCALX (16 + LOCALPAD) // Local memory size in x-dimension of TripleMatMul kernels
+#endif
+#ifndef LOCALY
+ #define LOCALY 16 // Local memory size in y-dimension of TripleMatMul kernels
+#endif
+#ifndef TMMWGSX
+ #define TMMWGSX 4 // Work-group size in x-dimension of TripleMatMul kernels
+#endif
+#ifndef TMMWGSY
+ #define TMMWGSY 4 // Work-group size in y-dimension of TripleMatMul kernels
+#endif
// =================================================================================================
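The hunk above turns the previously hard-coded constants into overridable preprocessor definitions. As a minimal host-side sketch (not CLBlast's actual host code; the helper name and the parameter values are illustrative), the defaults can be overridden by passing -D options when the kernel source is built with the plain OpenCL C API:

#include <CL/cl.h>

/* Hypothetical helper: build the program with overridden tuning parameters.
 * The values are examples only; omitting a -D flag falls back to the
 * #ifndef defaults in the kernel file above. */
static cl_int build_with_tuning(cl_program program, cl_device_id device) {
  const char *options = "-DINTERNAL_BLOCK_SIZE=16 -DLOCALPAD=1 "
                        "-DTMMWGSX=4 -DTMMWGSY=4";
  return clBuildProgram(program, 1, &device, options, NULL, NULL);
}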
@@ -172,7 +190,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
const int page = get_group_id(1) % num_pages;
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
- const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1));
+ const int ibx = get_group_id(0) * (get_local_size(0) * TMMWGSY);
const int iby = by*16;
const int id = lidx + lidy*get_local_size(0);
const int row = page*current_size*2 + current_size + ibx + id;
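With the defaults above, the rewritten ibx evaluates to the same value as before, since reqd_work_group_size makes get_local_size(1) equal to TMMWGSY. A worked example with assumed values (a TripleMatMul16 launch, work-group 2 in x, local id (1, 3), page 0, current_size 16):

#include <stdio.h>

int main(void) {
  const int lidx = 1, lidy = 3, group_x = 2, page = 0, current_size = 16;
  const int ibx = group_x * (4 * 4);   /* get_group_id(0) * (get_local_size(0) * TMMWGSY) */
  const int id  = lidx + lidy * 4;     /* lidx + lidy * get_local_size(0) */
  const int row = page * current_size * 2 + current_size + ibx + id;
  printf("ibx=%d id=%d row=%d\n", ibx, id, row);  /* prints: ibx=32 id=13 row=61 */
  return 0;
}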
@@ -195,7 +213,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
// Loads a 16 x 16 block of B into local memory using NX x 4 threads
for (int i = 0; i < 16; i += (size/4) ) { // += get_local_size(0)
- for (int _j = 0; _j < 16; _j += 4 ) { // += get_local_size(1)
+ for (int _j = 0; _j < 16; _j += TMMWGSY ) { // += get_local_size(1)
blm[(lidx + i) * LOCALX + (lidy + _j)] = bgm[k + i + _j*ldb];
}
}
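The load above stores the 16 x 16 tile of B with a row stride of LOCALX = 16 + LOCALPAD. A stand-alone sketch of the same padded-indexing idea (illustration only, assuming LOCALPAD = 1 and a full 16 x 16 work-group rather than the NX x 4 thread layout the kernel actually uses):

// Without the pad, work-items 0..15 (varying x, fixed y) would write addresses
// 16 words apart and collide on a few local-memory banks; with a row stride of
// 17 the 16 addresses fall into 16 different banks.
__kernel void padded_tile_example(__global float* out) {
  __local float tile[16 * (16 + 1)];               // LOCALY * LOCALX
  const int x = get_local_id(0);
  const int y = get_local_id(1);
  tile[x * (16 + 1) + y] = (float)(x * 16 + y);    // column-wise write, same pattern as blm[] above
  barrier(CLK_LOCAL_MEM_FENCE);
  out[get_global_id(1) * get_global_size(0) + get_global_id(0)] =
      tile[y * (16 + 1) + x];                       // row-wise read back
}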
@@ -321,7 +339,7 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r
// =================================================================================================
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
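The remaining hunks all apply the same change: kernels operating on 16-, 32- and 64-wide blocks scale the x-dimension of the required work-group size by 1, 2 and 4, so the default TMMWGSX = TMMWGSY = 4 reproduces the previous hard-coded sizes (4, 4), (8, 4) and (16, 4). A hypothetical host-side helper (illustrative, not CLBlast code) deriving the matching local work size under those defaults:

#include <stddef.h>

static void tmm_local_size(int block_size, size_t local[2]) {
  const size_t tmmwgsx = 4;                         /* assumed default */
  const size_t tmmwgsy = 4;                         /* assumed default */
  local[0] = (size_t)(block_size / 16) * tmmwgsx;   /* 16 -> 4, 32 -> 8, 64 -> 16 */
  local[1] = tmmwgsy;
}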
@@ -330,7 +348,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -338,7 +356,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -347,7 +365,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -355,7 +373,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -364,7 +382,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -374,7 +392,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
// =================================================================================================
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -383,7 +401,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -391,7 +409,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -400,7 +418,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -408,7 +426,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -417,7 +435,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];