diff options
author | cnugteren <web@cedricnugteren.nl> | 2016-04-09 22:22:24 -0600 |
---|---|---|
committer | cnugteren <web@cedricnugteren.nl> | 2016-04-09 22:22:24 -0600 |
commit | 1d3d38a2618c5663bf1549b08805137fd85f2efa (patch) | |
tree | 5de200346fc3d87c6e353d84744c59b2c703c16d /src/routines/level3/xherk.cc | |
parent | c2cfee76c4d8f7486d5b62b3e0a878867a32a070 (diff) |
Events are now properly implemented using event waiting list and asking the user to wait for event completion
Diffstat (limited to 'src/routines/level3/xherk.cc')
-rw-r--r-- | src/routines/level3/xherk.cc | 28 |
1 files changed, 22 insertions, 6 deletions
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index cbd0a188..574debe4 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -27,7 +27,7 @@ template <> const Precision Xherk<double2,double>::precision_ = Precision::kComp // Constructor: forwards to base class constructor template <typename T, typename U> -Xherk<T,U>::Xherk(Queue &queue, Event &event, const std::string &name): +Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -103,27 +103,40 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. Two copies are created. if (!a_no_temp) { - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, program, true, a_rotated, a_conjugate); + eventWaitList.push_back(eventProcessA); if (ErrorIn(status)) { return status; } } if (!b_no_temp) { - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + auto eventProcessB = Event(); + status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, program, true, a_rotated, b_conjugate); + eventWaitList.push_back(eventProcessB); if (ErrorIn(status)) { return status; } } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, program, true, c_rotated, false); + eventWaitList.push_back(eventProcessC); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -149,13 +162,16 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel = Event(); + status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + status = PadCopyTransposeMatrix(event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, program, false, c_rotated, false, upper, lower, true); if (ErrorIn(status)) { return status; } |