From fa6e5e67f585b77d34c3031c176de9a0f7904aa9 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 27 Oct 2017 22:12:30 +0200 Subject: Fixed a bug when using the matrix A-offset argument for the TRSM routine --- CHANGELOG | 1 + src/routines/level3/xtrsm.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 4d1bb764..14a6dd22 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ Development (next version) - Fixed a bug in the TRSM/TRSV routines due to missing synchronisations after GEMM/GEMV calls +- Fixed a bug in TRSM when using the a-offset argument - Added a CUDA API to CLBlast: * The library and kernels can be compiled with the CUDA driver API and NVRTC (requires CUDA 7.5) * Two CUDA API sample programs are added: SGEMM and DAXPY diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp index 119bf25d..d622e3bf 100644 --- a/src/routines/level3/xtrsm.cpp +++ b/src/routines/level3/xtrsm.cpp @@ -143,7 +143,7 @@ void Xtrsm::TrsmColMajor(const Side side, const Triangle triangle, auto gemm2 = Xgemm(queue_, gemm2_event.pointer()); gemm2.DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, m - i - block_size, n, block_size, ConstantNegOne(), - a_buffer, this_a_offset, a_ld, + a_buffer, this_a_offset + a_offset, a_ld, x_buffer, x_offset + i, x_ld, gemm_alpha, b_buffer, b_offset + i + block_size, b_ld); gemm2_event.WaitForCompletion(); @@ -172,7 +172,7 @@ void Xtrsm::TrsmColMajor(const Side side, const Triangle triangle, auto gemm2 = Xgemm(queue_, gemm2_event.pointer()); gemm2.DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, i, n, current_block_size, ConstantNegOne(), - a_buffer, this_a_offset, a_ld, + a_buffer, this_a_offset + a_offset, a_ld, x_buffer, x_offset + i, x_ld, gemm_alpha, b_buffer, b_offset, b_ld); gemm2_event.WaitForCompletion(); @@ -206,7 +206,7 @@ void Xtrsm::TrsmColMajor(const Side side, const Triangle triangle, gemm2.DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, m, i, current_block_size, ConstantNegOne(), x_buffer, x_offset + i * x_ld, x_ld, - a_buffer, this_a_offset, a_ld, gemm_alpha, + a_buffer, this_a_offset + a_offset, a_ld, gemm_alpha, b_buffer, b_offset, b_ld); gemm2_event.WaitForCompletion(); } @@ -233,7 +233,7 @@ void Xtrsm::TrsmColMajor(const Side side, const Triangle triangle, gemm2.DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, m, n - i - block_size, block_size, ConstantNegOne(), x_buffer, x_offset + i * x_ld, x_ld, - a_buffer, this_a_offset, a_ld, gemm_alpha, + a_buffer, this_a_offset + a_offset, a_ld, gemm_alpha, b_buffer, b_offset + (i + block_size) * b_ld, b_ld); gemm2_event.WaitForCompletion(); } -- cgit v1.2.3