diff options
-rw-r--r-- | CHANGELOG | 2 | ||||
-rw-r--r-- | src/routines/level2/xtrsv.cpp | 11 |
2 files changed, 8 insertions, 5 deletions
@@ -1,6 +1,6 @@
 
 Development (next version)
-- Fixed a bug in the TRSM routine due to missing synchronisations after GEMM calls
+- Fixed a bug in the TRSM/TRSV routines due to missing synchronisations after GEMM/GEMV calls
 - Added a CUDA API to CLBlast:
   * The library and kernels can be compiled with the CUDA driver API and NVRTC (requires CUDA 7.5)
   * Two CUDA API sample programs are added: SGEMM and DAXPY
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index d5d009ff..36c33a76 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -131,10 +131,13 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
     if (i > 0) {
       const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i;
       const auto gemv_n = (a_transpose == Transpose::kNo) ? i : block_size;
-      DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
-             a_buffer, a_offset + extra_offset_a, a_ld,
-             x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
-             x_buffer, x_offset + extra_offset_b, x_inc );
+      auto gemv_event = Event();
+      auto gemv = Xgemv<T>(queue_, gemv_event.pointer());
+      gemv.DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
+                  a_buffer, a_offset + extra_offset_a, a_ld,
+                  x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
+                  x_buffer, x_offset + extra_offset_b, x_inc);
+      gemv_event.WaitForCompletion();
     }
 
     // Runs the triangular substitution for the block size