From e609220393654e5400b0acd0d362367f5fe28ab8 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 31 May 2018 20:09:49 +0200 Subject: Some potential fixes for error -54 when launching TRSV and TRSM kernels --- src/kernels/level2/xtrsv.opencl | 2 +- .../level3/invert_diagonal_blocks_part2.opencl | 24 +++++++++++----------- src/kernels/level3/level3.opencl | 2 +- src/routines/levelx/xinvert.cpp | 3 ++- 4 files changed, 16 insertions(+), 15 deletions(-) (limited to 'src') diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl index 8777eb77..e7b6ae79 100644 --- a/src/kernels/level2/xtrsv.opencl +++ b/src/kernels/level2/xtrsv.opencl @@ -18,7 +18,7 @@ R"( // ================================================================================================= #if defined(ROUTINE_TRSV) -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) +__kernel void FillVector(const int n, const int inc, const int offset, __global real* restrict dest, const real_arg arg_value) { const real value = GetRealArg(arg_value); diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl index 8736203c..8e9b583e 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl @@ -19,7 +19,7 @@ R"( #if defined(ROUTINE_INVERT) // B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -28,7 +28,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in } // B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { 
__local real lm[LOCALY * LOCALX]; @@ -36,7 +36,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s } // B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -45,7 +45,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in } // B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -53,7 +53,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s } // B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -62,7 +62,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in } // B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -72,7 +72,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s // ================================================================================================= // B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int 
current_size, int num_pages, const int block_size) { @@ -81,7 +81,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in } // B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -89,7 +89,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s } // B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -98,7 +98,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in } // B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -106,7 +106,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s } // B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -115,7 +115,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in } // B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) +__kernel void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; diff --git a/src/kernels/level3/level3.opencl 
b/src/kernels/level3/level3.opencl index c67851df..bea73daf 100644 --- a/src/kernels/level3/level3.opencl +++ b/src/kernels/level3/level3.opencl @@ -76,7 +76,7 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM) -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) +__kernel void FillMatrix(const int m, const int n, const int ld, const int offset, __global real* restrict dest, const real_arg arg_value) { const real value = GetRealArg(arg_value); diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp index 65a28d73..09ef3ec1 100644 --- a/src/routines/levelx/xinvert.cpp +++ b/src/routines/levelx/xinvert.cpp @@ -113,7 +113,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle const auto npages = CeilDiv(n, current_size*2); const auto local0 = (current_size <= 32) ? current_size/4 : 16; const auto local = std::vector<size_t>{local0, 4}; - const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]}; + const auto global = std::vector<size_t>{Ceil(current_size/local[1], local[0]), + Ceil(npages*(current_size/16)*local[1], local[1])}; // Part 1 auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); -- cgit v1.2.3