diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-05-14 20:27:35 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-05-14 20:27:35 +0200 |
commit | 512b83dbad50cf04ea58ed6edc2def4fa6fc12ad (patch) | |
tree | 7e09dcd49ef8c90a621f4e7024af63feb36d0005 | |
parent | f151e56daa617e3327826f06f0765d1673fa8cfd (diff) |
Fixed a missing synchronization barrier in the invert kernel; fixes TRSM tests
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks.opencl | 10 |
1 file changed, 8 insertions, 2 deletions
```diff
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index 55f4a963..874c1510 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -113,13 +113,16 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
 
   // Computes the elements 0:j-1 of the j-th column
   for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) {
+    real sum;
     if (thread_index < j) {
-      real sum;
       SetToZero(sum);
       #pragma unroll
       for (int k = 0; k < j; ++k) {
         MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
       }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (thread_index < j) {
       real diagonal_value = lm[j][j];
       Negate(diagonal_value);
       Multiply(lm[thread_index][j], diagonal_value, sum);
@@ -133,13 +136,16 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
 
   // Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column
   for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) {
+    real sum;
     if (thread_index > j) {
-      real sum;
       SetToZero(sum);
       #pragma unroll
       for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
         MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
       }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (thread_index > j) {
       real diagonal_value = lm[j][j];
       Negate(diagonal_value);
       Multiply(lm[thread_index][j], diagonal_value, sum);
```