summaryrefslogtreecommitdiff
path: root/src/routines/level3/xgemm.cc
diff options
context:
space:
mode:
authorcnugteren <web@cedricnugteren.nl>2016-04-09 22:22:24 -0600
committercnugteren <web@cedricnugteren.nl>2016-04-09 22:22:24 -0600
commit1d3d38a2618c5663bf1549b08805137fd85f2efa (patch)
tree5de200346fc3d87c6e353d84744c59b2c703c16d /src/routines/level3/xgemm.cc
parentc2cfee76c4d8f7486d5b62b3e0a878867a32a070 (diff)
Events are now properly implemented using event waiting list and asking the user to wait for event completion
Diffstat (limited to 'src/routines/level3/xgemm.cc')
-rw-r--r--src/routines/level3/xgemm.cc28
1 files changed, 22 insertions, 6 deletions
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 5dc2ad7f..7557dcc3 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -29,7 +29,7 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xgemm<T>::Xgemm(Queue &queue, Event &event, const std::string &name):
+Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
@@ -122,30 +122,43 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
- status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+ auto eventProcessA = Event();
+ status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
program, true, a_do_transpose, a_conjugate);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessA);
}
// As above, but now for matrix B
if (!b_no_temp) {
- status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
+ auto eventProcessB = Event();
+ status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+ b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
program, true, b_do_transpose, b_conjugate);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessB);
}
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
if (!c_no_temp && beta != static_cast<T>(0)) {
- status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
+ auto eventProcessC = Event();
+ status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
program, true, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessC);
}
// Retrieves the Xgemm kernel from the compiled binary
@@ -170,12 +183,15 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
- status = RunKernel(kernel, global, local);
+ auto eventKernel = Event();
+ status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventKernel);
// Runs the post-processing kernel if needed
if (!c_no_temp) {
- status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
+ status = PadCopyTransposeMatrix(event_, eventWaitList,
+ m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
program, false, c_do_transpose, false);
if (ErrorIn(status)) { return status; }