summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: cnugteren <web@cedricnugteren.nl> 2016-05-15 16:10:56 +0200
committer: cnugteren <web@cedricnugteren.nl> 2016-05-15 16:10:56 +0200
commit: 716d7c67d91ef61e3d71e219f61c72859ac823eb (patch)
tree: 543af924d3e5557a05a434d332cc6205ee0f07ed
parent: 9e36b3b20d5bc69e8744e76f347a3f5e1345778a (diff)
Fixed a bug in the xGEMM routine related to the event incorrectly set
-rw-r--r--  CHANGELOG                     1
-rw-r--r--  src/routines/level3/xgemm.cc  5
-rw-r--r--  test/correctness/tester.cc    2
3 files changed, 5 insertions, 3 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 187fca73..6de365bf 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,7 @@
Development version (next release)
- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs
+- Fixed a bug in the xGEMM routine related to the event incorrectly set
Version 0.7.0
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index aa081e81..3699b548 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -184,12 +184,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// Launches the kernel
auto eventKernel = Event();
- status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList);
+ auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
+ status = RunKernel(kernel, global, local, eventPointer, eventWaitList);
if (ErrorIn(status)) { return status; }
- eventWaitList.push_back(eventKernel);
// Runs the post-processing kernel if needed
if (!c_no_temp) {
+ eventWaitList.push_back(eventKernel);
status = PadCopyTransposeMatrix(event_, eventWaitList,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc
index 26c4ba59..85ae7091 100644
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@@ -334,7 +334,7 @@ bool TestSimilarity(const T val1, const T val2) {
// Set the allowed error margin for floating-point comparisons
constexpr auto kErrorMarginRelative = T(0.025);
- constexpr auto kErrorMarginAbsolute = T(1.0e-4);
+ constexpr auto kErrorMarginAbsolute = T(1.0e-3);
// Shortcut, handles infinities
if (val1 == val2) {