summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Cedric Nugteren <web@cedricnugteren.nl> 2018-05-31 20:09:49 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2018-05-31 20:09:49 +0200
commit: e609220393654e5400b0acd0d362367f5fe28ab8 (patch)
tree: abb4ed1edf5f483e4ecc89b68614b87c922e4af9
parent: ff4d5558a6d945592cc51760628a77237cd30f67 (diff)
Some potential fixes for error -54 when launching TRSV and TRSM kernels
-rw-r--r-- src/kernels/level2/xtrsv.opencl | 2
-rw-r--r-- src/kernels/level3/invert_diagonal_blocks_part2.opencl | 24
-rw-r--r-- src/kernels/level3/level3.opencl | 2
-rw-r--r-- src/routines/levelx/xinvert.cpp | 3
4 files changed, 16 insertions, 15 deletions
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
index 8777eb77..e7b6ae79 100644
--- a/src/kernels/level2/xtrsv.opencl
+++ b/src/kernels/level2/xtrsv.opencl
@@ -18,7 +18,7 @@ R"(
// =================================================================================================
#if defined(ROUTINE_TRSV)
-__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
+__kernel
void FillVector(const int n, const int inc, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
diff --git a/src/kernels/level3/invert_diagonal_blocks_part2.opencl b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
index 8736203c..8e9b583e 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part2.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part2.opencl
@@ -19,7 +19,7 @@ R"(
#if defined(ROUTINE_INVERT)
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -28,7 +28,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -36,7 +36,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -45,7 +45,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -53,7 +53,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -62,7 +62,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -72,7 +72,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
// =================================================================================================
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -81,7 +81,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -89,7 +89,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -98,7 +98,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -106,7 +106,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -115,7 +115,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
+__kernel
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl
index c67851df..bea73daf 100644
--- a/src/kernels/level3/level3.opencl
+++ b/src/kernels/level3/level3.opencl
@@ -76,7 +76,7 @@ R"(
// =================================================================================================
#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
-__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
+__kernel
void FillMatrix(const int m, const int n, const int ld, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index 65a28d73..09ef3ec1 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -113,7 +113,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
const auto npages = CeilDiv(n, current_size*2);
const auto local0 = (current_size <= 32) ? current_size/4 : 16;
const auto local = std::vector<size_t>{local0, 4};
- const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
+ const auto global = std::vector<size_t>{Ceil(current_size/local[1], local[0]),
+ Ceil(npages*(current_size/16)*local[1], local[1])};
// Part 1
auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);