From 98a95c89fc0633efdc8439c942762bef9a1e5e1d Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 17 Jun 2016 12:32:06 +0200 Subject: Moved the RunKernel and PadCopyTransposeMatrix functions out of the Routine class --- src/routines/level1/xamax.cc | 4 ++-- src/routines/level1/xasum.cc | 4 ++-- src/routines/level1/xaxpy.cc | 4 ++-- src/routines/level1/xcopy.cc | 4 ++-- src/routines/level1/xdot.cc | 4 ++-- src/routines/level1/xnrm2.cc | 4 ++-- src/routines/level1/xscal.cc | 4 ++-- src/routines/level1/xswap.cc | 4 ++-- src/routines/level2/xgemv.cc | 2 +- src/routines/level2/xger.cc | 2 +- src/routines/level2/xher.cc | 2 +- src/routines/level2/xher2.cc | 2 +- src/routines/level3/xgemm.cc | 10 +++++----- src/routines/level3/xhemm.cc | 2 +- src/routines/level3/xher2k.cc | 16 ++++++++-------- src/routines/level3/xherk.cc | 10 +++++----- src/routines/level3/xsymm.cc | 2 +- src/routines/level3/xsyr2k.cc | 12 ++++++------ src/routines/level3/xsyrk.cc | 8 ++++---- src/routines/level3/xtrmm.cc | 2 +- src/routines/levelx/xomatcopy.cc | 2 +- 21 files changed, 52 insertions(+), 52 deletions(-) (limited to 'src/routines') diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc index 9a7d2173..6028d953 100644 --- a/src/routines/level1/xamax.cc +++ b/src/routines/level1/xamax.cc @@ -80,7 +80,7 @@ StatusCode Xamax::DoAmax(const size_t n, auto global1 = std::vector{db_["WGS1"]*temp_size}; auto local1 = std::vector{db_["WGS1"]}; auto kernelEvent = Event(); - status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } eventWaitList.push_back(kernelEvent); @@ -93,7 +93,7 @@ StatusCode Xamax::DoAmax(const size_t n, // Launches the epilogue kernel auto global2 = std::vector{db_["WGS2"]}; auto local2 = std::vector{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + status = RunKernel(kernel2, 
queue_, device_, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc index 3dcaa80a..6046a467 100644 --- a/src/routines/level1/xasum.cc +++ b/src/routines/level1/xasum.cc @@ -78,7 +78,7 @@ StatusCode Xasum::DoAsum(const size_t n, auto global1 = std::vector{db_["WGS1"]*temp_size}; auto local1 = std::vector{db_["WGS1"]}; auto kernelEvent = Event(); - status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } eventWaitList.push_back(kernelEvent); @@ -90,7 +90,7 @@ StatusCode Xasum::DoAsum(const size_t n, // Launches the epilogue kernel auto global2 = std::vector{db_["WGS2"]}; auto local2 = std::vector{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index b57001f9..dbc05cf7 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -94,13 +94,13 @@ StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, if (use_fast_kernel) { auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"]}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xcopy.cc 
b/src/routines/level1/xcopy.cc index 273e87a6..8848201c 100644 --- a/src/routines/level1/xcopy.cc +++ b/src/routines/level1/xcopy.cc @@ -88,13 +88,13 @@ StatusCode Xcopy::DoCopy(const size_t n, if (use_fast_kernel) { auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"]}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc index 25eccadf..a819564a 100644 --- a/src/routines/level1/xdot.cc +++ b/src/routines/level1/xdot.cc @@ -86,7 +86,7 @@ StatusCode Xdot::DoDot(const size_t n, auto global1 = std::vector{db_["WGS1"]*temp_size}; auto local1 = std::vector{db_["WGS1"]}; auto kernelEvent = Event(); - status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } eventWaitList.push_back(kernelEvent); @@ -98,7 +98,7 @@ StatusCode Xdot::DoDot(const size_t n, // Launches the epilogue kernel auto global2 = std::vector{db_["WGS2"]}; auto local2 = std::vector{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index eca283f8..8904c369 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -78,7 +78,7 @@ StatusCode Xnrm2::DoNrm2(const size_t n, auto global1 = 
std::vector{db_["WGS1"]*temp_size}; auto local1 = std::vector{db_["WGS1"]}; auto kernelEvent = Event(); - status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } eventWaitList.push_back(kernelEvent); @@ -90,7 +90,7 @@ StatusCode Xnrm2::DoNrm2(const size_t n, // Launches the epilogue kernel auto global2 = std::vector{db_["WGS2"]}; auto local2 = std::vector{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc index 0ce211b6..8078c076 100644 --- a/src/routines/level1/xscal.cc +++ b/src/routines/level1/xscal.cc @@ -82,13 +82,13 @@ StatusCode Xscal::DoScal(const size_t n, const T alpha, if (use_fast_kernel) { auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"]}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc index 773d78b5..01184db5 100644 --- a/src/routines/level1/xswap.cc +++ b/src/routines/level1/xswap.cc @@ -88,13 +88,13 @@ StatusCode Xswap::DoSwap(const size_t n, if (use_fast_kernel) { auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, 
queue_, device_, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"]}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index 18e61f28..07c6ec9d 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -169,7 +169,7 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, // Launches the kernel auto global = std::vector{global_size}; auto local = std::vector{local_size}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index 7d6fb091..c69efc23 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -94,7 +94,7 @@ StatusCode Xger::DoGer(const Layout layout, auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]); auto global = std::vector{a_one_ceiled, a_two_ceiled}; auto local = std::vector{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index 3d5c0baf..ed8763dc 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -105,7 +105,7 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); auto global = std::vector{global_one, global_two}; auto local = std::vector{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, 
local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index a7362410..10b98329 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -96,7 +96,7 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); auto global = std::vector{global_one, global_two}; auto local = std::vector{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 713bed8f..eced53ab 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -142,7 +142,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, m_ceiled, k_ceiled, m_ceiled, 0, a_temp, ConstantOne(), program, @@ -154,7 +154,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // As above, but now for matrix B if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne(), program, @@ -166,7 +166,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // As above, but now for matrix C. 
This is only necessary if C is used both as input and output. if (!c_no_temp && beta != static_cast(0)) { auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, ConstantOne(), program, @@ -199,13 +199,13 @@ StatusCode Xgemm::DoGemm(const Layout layout, // Launches the kernel auto eventKernel = Event(); auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; - status = RunKernel(kernel, global, local, eventPointer, eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); if (ErrorIn(status)) { return status; } // Runs the post-processing kernel if needed if (!c_no_temp) { eventWaitList.push_back(eventKernel); - status = PadCopyTransposeMatrix(event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index a6e853e9..9791d7b4 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -80,7 +80,7 @@ StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; auto kernelEvent = Event(); - status = RunKernel(kernel, global, local, kernelEvent.pointer()); + status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } // Synchronize now: 'DoGemm' does not accept a list of events to wait for diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index c891c202..43f7bb76 100644 --- a/src/routines/level3/xher2k.cc +++ 
b/src/routines/level3/xher2k.cc @@ -129,7 +129,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // case nothing has to be done, these kernels can be skipped. if (!a1_no_temp) { auto eventProcessA1 = Event(); - status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA1.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, ConstantOne(), program, @@ -139,7 +139,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co } if (!a2_no_temp) { auto eventProcessA2 = Event(); - status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA2.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, ConstantOne(), program, @@ -149,7 +149,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co } if (!b1_no_temp) { auto eventProcessB1 = Event(); - status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB1.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, ConstantOne(), program, @@ -159,7 +159,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co } if (!b2_no_temp) { auto eventProcessB2 = Event(); - status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB2.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, ConstantOne(), program, @@ -171,7 +171,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Furthermore, also creates a (possibly padded) 
copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -201,7 +201,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Launches the kernel auto eventKernel1 = Event(); - status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel1); @@ -217,14 +217,14 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Runs the kernel again auto eventKernel2 = Event(); - status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel2); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 9d64af95..8ebcbfa8 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -121,7 +121,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // case nothing has to be done, these kernels can be skipped. Two copies are created. 
if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne(), program, @@ -131,7 +131,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons } if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne(), program, @@ -143,7 +143,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -173,14 +173,14 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // Launches the kernel auto eventKernel = Event(); - status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, n_ceiled, n_ceiled, 
n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index 379e2908..650afbfc 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -80,7 +80,7 @@ StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; auto kernelEvent = Event(); - status = RunKernel(kernel, global, local, kernelEvent.pointer()); + status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } // Synchronize now: 'DoGemm' does not accept a list of events to wait for diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 886d1e16..4b436381 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -121,7 +121,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // case nothing has to be done, these kernels can be skipped. 
if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne(), program, @@ -131,7 +131,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons } if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne(), program, @@ -143,7 +143,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -173,7 +173,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Launches the kernel auto eventKernel1 = Event(); - status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel1); @@ -186,14 +186,14 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Runs the kernel again auto eventKernel2 = Event(); - status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, 
device_, global, local, eventKernel2.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel2); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index 000347f3..5c49795b 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -114,7 +114,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne(), program, @@ -126,7 +126,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. 
auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -156,14 +156,14 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Launches the kernel auto eventKernel = Event(); - status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index c62305aa..98e0622d 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -83,7 +83,7 @@ StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; auto kernelEvent = Event(); - status = RunKernel(kernel, global, local, kernelEvent.pointer()); + status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } // Synchronize now: 'DoGemm' does not accept a list of events to wait for diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc index dcc4e52a..199a4903 100644 --- a/src/routines/levelx/xomatcopy.cc +++ b/src/routines/levelx/xomatcopy.cc @@ -81,7 +81,7 @@ StatusCode 
Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_trans const auto program = GetProgramFromCache(); auto emptyEventList = std::vector<Event>(); - status = PadCopyTransposeMatrix(event_, emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, b_one, b_two, b_ld, b_offset, b_buffer, alpha, program, false, transpose, conjugate); -- cgit v1.2.3