summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2017-05-14 20:27:35 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2017-05-14 20:27:35 +0200
commit 512b83dbad50cf04ea58ed6edc2def4fa6fc12ad (patch)
tree 7e09dcd49ef8c90a621f4e7024af63feb36d0005 /src
parent f151e56daa617e3327826f06f0765d1673fa8cfd (diff)
Fixed a missing synchronization barrier in the invert kernel; fixes TRSM tests
Diffstat (limited to 'src')
-rw-r--r-- src/kernels/level3/invert_diagonal_blocks.opencl 10
1 file changed, 8 insertions, 2 deletions
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index 55f4a963..874c1510 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -113,13 +113,16 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
// Computes the elements 0:j-1 of the j-th column
for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) {
+ real sum;
if (thread_index < j) {
- real sum;
SetToZero(sum);
#pragma unroll
for (int k = 0; k < j; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (thread_index < j) {
real diagonal_value = lm[j][j];
Negate(diagonal_value);
Multiply(lm[thread_index][j], diagonal_value, sum);
@@ -133,13 +136,16 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
// Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column
for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) {
+ real sum;
if (thread_index > j) {
- real sum;
SetToZero(sum);
#pragma unroll
for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (thread_index > j) {
real diagonal_value = lm[j][j];
Negate(diagonal_value);
Multiply(lm[thread_index][j], diagonal_value, sum);