diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/routine.cc | 158 | ||||
-rw-r--r-- | src/routines/level1/xamax.cc | 4 | ||||
-rw-r--r-- | src/routines/level1/xasum.cc | 4 | ||||
-rw-r--r-- | src/routines/level1/xaxpy.cc | 4 | ||||
-rw-r--r-- | src/routines/level1/xcopy.cc | 4 | ||||
-rw-r--r-- | src/routines/level1/xdot.cc | 4 | ||||
-rw-r--r-- | src/routines/level1/xnrm2.cc | 4 | ||||
-rw-r--r-- | src/routines/level1/xscal.cc | 4 | ||||
-rw-r--r-- | src/routines/level1/xswap.cc | 4 | ||||
-rw-r--r-- | src/routines/level2/xgemv.cc | 2 | ||||
-rw-r--r-- | src/routines/level2/xger.cc | 2 | ||||
-rw-r--r-- | src/routines/level2/xher.cc | 2 | ||||
-rw-r--r-- | src/routines/level2/xher2.cc | 2 | ||||
-rw-r--r-- | src/routines/level3/xgemm.cc | 10 | ||||
-rw-r--r-- | src/routines/level3/xhemm.cc | 2 | ||||
-rw-r--r-- | src/routines/level3/xher2k.cc | 16 | ||||
-rw-r--r-- | src/routines/level3/xherk.cc | 10 | ||||
-rw-r--r-- | src/routines/level3/xsymm.cc | 2 | ||||
-rw-r--r-- | src/routines/level3/xsyr2k.cc | 12 | ||||
-rw-r--r-- | src/routines/level3/xsyrk.cc | 8 | ||||
-rw-r--r-- | src/routines/level3/xtrmm.cc | 2 | ||||
-rw-r--r-- | src/routines/levelx/xomatcopy.cc | 2 |
22 files changed, 66 insertions, 196 deletions
diff --git a/src/routine.cc b/src/routine.cc index ee3ba341..c59cbc11 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -30,9 +30,6 @@ Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name, context_(queue_.GetContext()), device_(queue_.GetDevice()), device_name_(device_.Name()), - max_work_item_dimensions_(device_.MaxWorkItemDimensions()), - max_work_item_sizes_(device_.MaxWorkItemSizes()), - max_work_group_size_(device_.MaxWorkGroupSize()), db_(queue_, routines, precision_) { } @@ -135,21 +132,21 @@ StatusCode Routine<T>::SetUp() { // ================================================================================================= // Enqueues a kernel, waits for completion, and checks for errors -template <typename T> -StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global, - const std::vector<size_t> &local, EventPointer event, - std::vector<Event>& waitForEvents) { +StatusCode RunKernel(Kernel &kernel, Queue queue, const Device device, + std::vector<size_t> global, const std::vector<size_t> &local, + EventPointer event, std::vector<Event>& waitForEvents) { // Tests for validity of the local thread sizes - if (local.size() > max_work_item_dimensions_) { + if (local.size() > device.MaxWorkItemDimensions()) { return StatusCode::kInvalidLocalNumDimensions; } + const auto max_work_item_sizes = device.MaxWorkItemSizes(); for (auto i=size_t{0}; i<local.size(); ++i) { - if (local[i] > max_work_item_sizes_[i]) { return StatusCode::kInvalidLocalThreadsDim; } + if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; } } auto local_size = size_t{1}; for (auto &item: local) { local_size *= item; } - if (local_size > max_work_group_size_) { return StatusCode::kInvalidLocalThreadsTotal; } + if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; } // Make sure the global thread sizes are at least equal to the local sizes for (auto i=size_t{0}; i<global.size(); ++i) { @@ -157,12 +154,12 @@ StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global, } // Tests for local memory usage - const auto local_mem_usage = kernel.LocalMemUsage(device_); - if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; } + const auto local_mem_usage = kernel.LocalMemUsage(device); + if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; } // Launches the kernel (and checks for launch errors) try { - kernel.Launch(queue_, global, local, event, waitForEvents); + kernel.Launch(queue, global, local, event, waitForEvents); } catch (...) { return StatusCode::kKernelLaunchError; } // No errors, normal termination of this function @@ -170,138 +167,11 @@ StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global, } // As above, but without an event waiting list -template <typename T> -StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global, - const std::vector<size_t> &local, EventPointer event) { +StatusCode RunKernel(Kernel &kernel, Queue queue, const Device device, + std::vector<size_t> global, const std::vector<size_t> &local, + EventPointer event) { auto emptyWaitingList = std::vector<Event>(); - return RunKernel(kernel, global, local, event, emptyWaitingList); -} - -// ================================================================================================= - -// Copies or transposes a matrix and optionally pads/unpads it with zeros -template <typename T> -StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents, - const size_t src_one, const size_t src_two, - const size_t src_ld, const size_t src_offset, - const Buffer<T> &src, - const size_t dest_one, const size_t dest_two, - const size_t dest_ld, const size_t dest_offset, - const Buffer<T> &dest, - const T alpha, - const Program &program, const bool do_pad, - const bool do_transpose, const bool do_conjugate, - const bool upper, const bool lower, - const bool diagonal_imag_zero) { - - // Determines whether or not the fast-version could potentially be used - auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && - (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && - (upper == false) && (lower == false) && (diagonal_imag_zero == false); - - // Determines the right kernel - auto kernel_name = std::string{}; - if (do_transpose) { - if (use_fast_kernel && - IsMultiple(src_ld, db_["TRA_WPT"]) && - IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) && - IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"])) { - kernel_name = "TransposeMatrixFast"; - } - else { - use_fast_kernel = false; - kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; - } - } - else { - if (use_fast_kernel && - IsMultiple(src_ld, db_["COPY_VW"]) && - IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) && - IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"])) { - kernel_name = "CopyMatrixFast"; - } - else { - use_fast_kernel = false; - kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix"; - } - } - - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - - // Retrieves the kernel from the compiled binary - try { - auto kernel = Kernel(program, kernel_name); - - // Sets the kernel arguments - if (use_fast_kernel) { - kernel.SetArgument(0, static_cast<int>(src_ld)); - kernel.SetArgument(1, src()); - kernel.SetArgument(2, dest()); - kernel.SetArgument(3, alpha_buffer()); - } - else { - kernel.SetArgument(0, static_cast<int>(src_one)); - kernel.SetArgument(1, static_cast<int>(src_two)); - kernel.SetArgument(2, static_cast<int>(src_ld)); - kernel.SetArgument(3, static_cast<int>(src_offset)); - kernel.SetArgument(4, src()); - kernel.SetArgument(5, static_cast<int>(dest_one)); - kernel.SetArgument(6, static_cast<int>(dest_two)); - kernel.SetArgument(7, static_cast<int>(dest_ld)); - kernel.SetArgument(8, static_cast<int>(dest_offset)); - kernel.SetArgument(9, dest()); - kernel.SetArgument(10, alpha_buffer()); - if (do_pad) { - kernel.SetArgument(11, static_cast<int>(do_conjugate)); - } - else { - kernel.SetArgument(11, static_cast<int>(upper)); - kernel.SetArgument(12, static_cast<int>(lower)); - kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); - } - } - - // Launches the kernel and returns the error code. Uses global and local thread sizes based on - // parameters in the database. - if (do_transpose) { - if (use_fast_kernel) { - const auto global = std::vector<size_t>{ - dest_one / db_["TRA_WPT"], - dest_two / db_["TRA_WPT"] - }; - const auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]}; - return RunKernel(kernel, global, local, event, waitForEvents); - } - else { - const auto global = std::vector<size_t>{ - Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]), - Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]) - }; - const auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]}; - return RunKernel(kernel, global, local, event, waitForEvents); - } - } - else { - if (use_fast_kernel) { - const auto global = std::vector<size_t>{ - dest_one / db_["COPY_VW"], - dest_two / db_["COPY_WPT"] - }; - const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]}; - return RunKernel(kernel, global, local, event, waitForEvents); - } - else { - const auto global = std::vector<size_t>{ - Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"]) - }; - const auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - return RunKernel(kernel, global, local, event, waitForEvents); - } - } - } catch (...) { return StatusCode::kInvalidKernel; } + return RunKernel(kernel, queue, device, global, local, event, emptyWaitingList); } // ================================================================================================= diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc index 9a7d2173..6028d953 100644 --- a/src/routines/level1/xamax.cc +++ b/src/routines/level1/xamax.cc @@ -80,7 +80,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n, auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; auto local1 = std::vector<size_t>{db_["WGS1"]}; auto kernelEvent = Event(); - status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } eventWaitList.push_back(kernelEvent); @@ -93,7 +93,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n, // Launches the epilogue kernel auto global2 = std::vector<size_t>{db_["WGS2"]}; auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc index 3dcaa80a..6046a467 100644 --- a/src/routines/level1/xasum.cc +++ b/src/routines/level1/xasum.cc @@ -78,7 +78,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n, auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; auto local1 = std::vector<size_t>{db_["WGS1"]}; auto kernelEvent = Event(); - status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } eventWaitList.push_back(kernelEvent); @@ -90,7 +90,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n, // Launches the epilogue kernel auto global2 = std::vector<size_t>{db_["WGS2"]}; auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index b57001f9..dbc05cf7 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -94,13 +94,13 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, if (use_fast_kernel) { auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc index 273e87a6..8848201c 100644 --- a/src/routines/level1/xcopy.cc +++ b/src/routines/level1/xcopy.cc @@ -88,13 +88,13 @@ StatusCode Xcopy<T>::DoCopy(const size_t n, if (use_fast_kernel) { auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc index 25eccadf..a819564a 100644 --- a/src/routines/level1/xdot.cc +++ b/src/routines/level1/xdot.cc @@ -86,7 +86,7 @@ StatusCode Xdot<T>::DoDot(const size_t n, auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; auto local1 = std::vector<size_t>{db_["WGS1"]}; auto kernelEvent = Event(); - status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } eventWaitList.push_back(kernelEvent); @@ -98,7 +98,7 @@ StatusCode Xdot<T>::DoDot(const size_t n, // Launches the epilogue kernel auto global2 = std::vector<size_t>{db_["WGS2"]}; auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index eca283f8..8904c369 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -78,7 +78,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n, auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; auto local1 = std::vector<size_t>{db_["WGS1"]}; auto kernelEvent = Event(); - status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } eventWaitList.push_back(kernelEvent); @@ -90,7 +90,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n, // Launches the epilogue kernel auto global2 = std::vector<size_t>{db_["WGS2"]}; auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc index 0ce211b6..8078c076 100644 --- a/src/routines/level1/xscal.cc +++ b/src/routines/level1/xscal.cc @@ -82,13 +82,13 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha, if (use_fast_kernel) { auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc index 773d78b5..01184db5 100644 --- a/src/routines/level1/xswap.cc +++ b/src/routines/level1/xswap.cc @@ -88,13 +88,13 @@ StatusCode Xswap<T>::DoSwap(const size_t n, if (use_fast_kernel) { auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index 18e61f28..07c6ec9d 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -169,7 +169,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, // Launches the kernel auto global = std::vector<size_t>{global_size}; auto local = std::vector<size_t>{local_size}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index 7d6fb091..c69efc23 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -94,7 +94,7 @@ StatusCode Xger<T>::DoGer(const Layout layout, auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]); auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled}; auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index 3d5c0baf..ed8763dc 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -105,7 +105,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); auto global = std::vector<size_t>{global_one, global_two}; auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index a7362410..10b98329 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -96,7 +96,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); auto global = std::vector<size_t>{global_one, global_two}; auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, local, event_); + status = RunKernel(kernel, queue_, device_, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 713bed8f..eced53ab 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -142,7 +142,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, m_ceiled, k_ceiled, m_ceiled, 0, a_temp, ConstantOne<T>(), program, @@ -154,7 +154,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // As above, but now for matrix B if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne<T>(), program, @@ -166,7 +166,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // As above, but now for matrix C. This is only necessary if C is used both as input and output. if (!c_no_temp && beta != static_cast<T>(0)) { auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, ConstantOne<T>(), program, @@ -199,13 +199,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // Launches the kernel auto eventKernel = Event(); auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; - status = RunKernel(kernel, global, local, eventPointer, eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList); if (ErrorIn(status)) { return status; } // Runs the post-processing kernel if needed if (!c_no_temp) { eventWaitList.push_back(eventKernel); - status = PadCopyTransposeMatrix(event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, ConstantOne<T>(), program, diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index a6e853e9..9791d7b4 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -80,7 +80,7 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; auto kernelEvent = Event(); - status = RunKernel(kernel, global, local, kernelEvent.pointer()); + status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } // Synchronize now: 'DoGemm' does not accept a list of events to wait for diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index c891c202..43f7bb76 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -129,7 +129,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // case nothing has to be done, these kernels can be skipped. if (!a1_no_temp) { auto eventProcessA1 = Event(); - status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA1.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, ConstantOne<T>(), program, @@ -139,7 +139,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co } if (!a2_no_temp) { auto eventProcessA2 = Event(); - status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA2.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, ConstantOne<T>(), program, @@ -149,7 +149,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co } if (!b1_no_temp) { auto eventProcessB1 = Event(); - status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB1.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, ConstantOne<T>(), program, @@ -159,7 +159,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co } if (!b2_no_temp) { auto eventProcessB2 = Event(); - status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB2.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, ConstantOne<T>(), program, @@ -171,7 +171,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne<T>(), program, @@ -201,7 +201,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // Launches the kernel auto eventKernel1 = Event(); - status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel1); @@ -217,14 +217,14 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // Runs the kernel again auto eventKernel2 = Event(); - status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel2); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne<T>(), program, diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 9d64af95..8ebcbfa8 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -121,7 +121,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons // case nothing has to be done, these kernels can be skipped. Two copies are created. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne<T>(), program, @@ -131,7 +131,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons } if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne<T>(), program, @@ -143,7 +143,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne<T>(), program, @@ -173,14 +173,14 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons // Launches the kernel auto eventKernel = Event(); - status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne<T>(), program, diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index 379e2908..650afbfc 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -80,7 +80,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; auto kernelEvent = Event(); - status = RunKernel(kernel, global, local, kernelEvent.pointer()); + status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } // Synchronize now: 'DoGemm' does not accept a list of events to wait for diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 886d1e16..4b436381 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -121,7 +121,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne<T>(), program, @@ -131,7 +131,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons } if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne<T>(), program, @@ -143,7 +143,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne<T>(), program, @@ -173,7 +173,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // Launches the kernel auto eventKernel1 = Event(); - status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel1); @@ -186,14 +186,14 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // Runs the kernel again auto eventKernel2 = Event(); - status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel2); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne<T>(), program, diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index 000347f3..5c49795b 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -114,7 +114,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne<T>(), program, @@ -126,7 +126,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne<T>(), program, @@ -156,14 +156,14 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const // Launches the kernel auto eventKernel = Event(); - status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); + status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } eventWaitList.push_back(eventKernel); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne<T>(), program, diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index c62305aa..98e0622d 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -83,7 +83,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; auto kernelEvent = Event(); - status = RunKernel(kernel, global, local, kernelEvent.pointer()); + status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } // Synchronize now: 'DoGemm' does not accept a list of events to wait for diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc index dcc4e52a..199a4903 100644 --- a/src/routines/levelx/xomatcopy.cc +++ b/src/routines/levelx/xomatcopy.cc @@ -81,7 +81,7 @@ StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_trans const auto program = GetProgramFromCache(); auto emptyEventList = std::vector<Event>(); - status = PadCopyTransposeMatrix(event_, emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, b_one, b_two, b_ld, b_offset, b_buffer, alpha, program, false, transpose, conjugate); |