summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG 2
-rw-r--r-- src/routines/level2/xtrsv.cpp 11
2 files changed, 8 insertions, 5 deletions
diff --git a/CHANGELOG b/CHANGELOG
index a220fed1..4d1bb764 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,6 @@
Development (next version)
-- Fixed a bug in the TRSM routine due to missing synchronisations after GEMM calls
+- Fixed a bug in the TRSM/TRSV routines due to missing synchronisations after GEMM/GEMV calls
- Added a CUDA API to CLBlast:
* The library and kernels can be compiled with the CUDA driver API and NVRTC (requires CUDA 7.5)
* Two CUDA API sample programs are added: SGEMM and DAXPY
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index d5d009ff..36c33a76 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -131,10 +131,13 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
if (i > 0) {
const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i;
const auto gemv_n = (a_transpose == Transpose::kNo) ? i : block_size;
- DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
- a_buffer, a_offset + extra_offset_a, a_ld,
- x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
- x_buffer, x_offset + extra_offset_b, x_inc );
+ auto gemv_event = Event();
+ auto gemv = Xgemv<T>(queue_, gemv_event.pointer());
+ gemv.DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
+ a_buffer, a_offset + extra_offset_a, a_ld,
+ x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
+ x_buffer, x_offset + extra_offset_b, x_inc);
+ gemv_event.WaitForCompletion();
}
// Runs the triangular substitution for the block size