summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Cedric Nugteren <web@cedricnugteren.nl>, 2018-01-01 16:10:11 +0100
committer: GitHub <noreply@github.com>, 2018-01-01 16:10:11 +0100
commit: 8040a4e355bdf6531eb9c4c5ae1fe4f792899d24 (patch)
tree: b95bb54e9b7722b786f22b97eb157e9093d0fc0a
parent: b4c8e1d9a5804358d6ae350111c85405c3183807 (diff)
parent: ad483123e6f7aab223417d8387baf74ae098a2a2 (diff)
Merge pull request #236 from CNugteren/trsm_compilation
Fixed compilation of TRSM/Invert for AMD APP
-rw-r--r-- CHANGELOG 1
-rw-r--r-- src/kernels/level3/invert_diagonal_blocks_part1.opencl 25
2 files changed, 18 insertions, 8 deletions
diff --git a/CHANGELOG b/CHANGELOG
index e2f0d872..d49cb3f5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -5,6 +5,7 @@ Development (next version)
- Added OpenCL pre-processor to unroll loops and perform array-to-register promotions for compilers
which don't do this themselves (ARM Mali) - greatly improves performance on these platforms
- Added first tuners for the TRSV (block size) and TRSM (invert kernel) routines
+- Fixed an issue with a crashing/hanging AMD APP compiler with the TRSM routine (invert kernel)
- Improved compilation time by splitting the tuning database into multiple compilation units
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index 040fcc83..c1f96bd7 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -83,7 +83,7 @@ R"(
// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
-void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
+void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld,
__global real* restrict dest, const int outer_block_size,
const int unit_diagonal, const int is_upper)
{
@@ -91,29 +91,38 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
const int block_index = get_group_id(0);
// Sets the offset for this particular block in the source and destination matrices
+ const int block_index_per_block = block_index * INTERNAL_BLOCK_SIZE;
const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
- const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block,
- (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that
+ const int block_index_div = block_index / num_inner_blocks;
+ const int block_index_mod = block_index % num_inner_blocks;
+ const int offset_part1 = block_index_div * outer_block_size * outer_block_size; // go to the block_index_div outer outer_block_size*outer_block_size block
+ const int offset_part2 = block_index_mod * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the block_index_mod inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that
+ const int dest_block_offset = offset_part1 + offset_part2;
// Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
__local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
// Loads the source lower triangle into local memory. Any values in the upper triangle or
// outside of the matrix are set to zero
- #pragma unroll
for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) {
- const bool condition = (is_upper) ? (thread_index <= _j && block_index*INTERNAL_BLOCK_SIZE + _j < n) :
- (thread_index >= _j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
+ bool condition = false;
+ if (is_upper) {
+ condition = (thread_index <= _j) && (block_index_per_block + _j < n);
+ }
+ else {
+ condition = (thread_index >= _j) && (block_index_per_block + thread_index < n);
+ }
if (condition) {
- lm[thread_index][_j] = src[_j*src_ld + thread_index + src_block_offset];
+ const int src_index = _j*src_ld + thread_index + src_block_offset;
+ lm[thread_index][_j] = src[src_index];
}
else {
SetToZero(lm[thread_index][_j]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
-
+
// Inverts the diagonal
real inverted_diagonal;
SetToOne(inverted_diagonal);