diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-12-19 21:10:48 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-12-19 21:10:48 +0100 |
commit | 07a7012b0dc4824f95020dadd4cb32ffc8a34f7a (patch) | |
tree | 95f824f63f6fe0ca2f8fad600e60ebfdac9be49e /src/kernels | |
parent | 249bdaa8e9a111573f5c3a821230bba6437817c7 (diff) |
Added skeleton for a tuner for the invert kernel
Diffstat (limited to 'src/kernels')
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks.opencl | 50 |
1 files changed, 34 insertions, 16 deletions
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl index db1513c1..e8f0ea91 100644 --- a/src/kernels/level3/invert_diagonal_blocks.opencl +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -56,8 +56,26 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) -#define LOCALX 17 // 16 + 1 to avoid bank conflicts -#define LOCALY 16 +// Parameters set by the tuner +// TODO: Make these actually tunable +#ifndef INTERNAL_BLOCK_SIZE + #define INTERNAL_BLOCK_SIZE 16 // Internal block size of the invert kernel +#endif +#ifndef LOCALPAD + #define LOCALPAD 0 // Padding in the x-dimension of the local memory to avoid bank conflicts +#endif +#ifndef LOCALX + #define LOCALX (16 + LOCALPAD) // Local memory size in x-dimension of TripleMatMul kernels +#endif +#ifndef LOCALY + #define LOCALY 16 // Local memory size in y-dimension of TripleMatMul kernels +#endif +#ifndef TMMWGSX + #define TMMWGSX 4 // Work-group size in x-dimension of TripleMatMul kernels +#endif +#ifndef TMMWGSY + #define TMMWGSY 4 // Work-group size in y-dimension of TripleMatMul kernels +#endif // ================================================================================================= @@ -172,7 +190,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, const int page = get_group_id(1) % num_pages; const int lidx = get_local_id(0); const int lidy = get_local_id(1); - const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); + const int ibx = get_group_id(0) * (get_local_size(0) * TMMWGSY); const int iby = by*16; const int id = lidx + lidy*get_local_size(0); const int row = page*current_size*2 + current_size + ibx + id; @@ -195,7 +213,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, // Loads a 16 x 16 block of B into local memory using NX x 4 threads for (int i = 0; i < 16; i += (size/4) ) { // += get_local_size(0) - for (int _j = 0; _j < 16; _j += 4 ) { // += get_local_size(1) + for (int _j = 0; _j < 16; _j += TMMWGSY ) { // += get_local_size(1) blm[(lidx + i) * LOCALX + (lidy + _j)] = bgm[k + i + _j*ldb]; } } @@ -321,7 +339,7 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r // ================================================================================================= // B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -330,7 +348,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in } // B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -338,7 +356,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s } // B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -347,7 +365,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in } // B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -355,7 +373,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s } // B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -364,7 +382,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in } // B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -374,7 +392,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s // ================================================================================================= // B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -383,7 +401,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in } // B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -391,7 +409,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s } // B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -400,7 +418,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in } // B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -408,7 +426,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s } // B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -417,7 +435,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in } // B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; |