diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-06-17 12:32:06 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-06-17 12:32:06 +0200 |
commit | 98a95c89fc0633efdc8439c942762bef9a1e5e1d (patch) | |
tree | d37775c4bf07229f7eae38c335da85eaf0c468a9 /include | |
parent | 520e28e7a72f288f04d04d86d4e7560d78159820 (diff) |
Moved the RunKernel and PadCopyTransposeMatrix functions out of the Routine class
Diffstat (limited to 'include')
26 files changed, 204 insertions, 53 deletions
diff --git a/include/internal/routine.h b/include/internal/routine.h index 0f64c479..e1888f1f 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -40,30 +40,6 @@ class Routine { StatusCode SetUp(); protected: - - // Runs a kernel given the global and local thread sizes - StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global, - const std::vector<size_t> &local, EventPointer event, - std::vector<Event>& waitForEvents); - - // As above, but without an event waiting list - StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global, - const std::vector<size_t> &local, EventPointer event); - - // Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write - // to symmetric and triangular matrices through optional arguments. - StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents, - const size_t src_one, const size_t src_two, - const size_t src_ld, const size_t src_offset, - const Buffer<T> &src, - const size_t dest_one, const size_t dest_two, - const size_t dest_ld, const size_t dest_offset, - const Buffer<T> &dest, - const T alpha, - const Program &program, const bool do_pad, - const bool do_transpose, const bool do_conjugate, - const bool upper = false, const bool lower = false, - const bool diagonal_imag_zero = false); // Stores a newly compiled binary/program into the cache void StoreBinaryToCache(const std::string& binary) const { @@ -105,16 +81,28 @@ class Routine { // OpenCL device properties const std::string device_name_; - const size_t max_work_item_dimensions_; - const std::vector<size_t> max_work_item_sizes_; - const size_t max_work_group_size_; // Connection to the database for all the device-specific parameters const Database db_; }; // ================================================================================================= + +// Enqueues a kernel, waits for completion, and checks for errors +StatusCode RunKernel(Kernel &kernel, Queue queue, const Device device, + std::vector<size_t> global, const std::vector<size_t> &local, + EventPointer event, std::vector<Event>& waitForEvents); + +// As above, but without an event waiting list +StatusCode RunKernel(Kernel &kernel, Queue queue, const Device device, + std::vector<size_t> global, const std::vector<size_t> &local, + EventPointer event); + +// ================================================================================================= } // namespace clblast +// Temporary fix: TODO place include in a more logical place +#include "internal/routines/common.h" + // CLBLAST_ROUTINE_H_ #endif diff --git a/include/internal/routines/common.h b/include/internal/routines/common.h new file mode 100644 index 00000000..95fbde46 --- /dev/null +++ b/include/internal/routines/common.h @@ -0,0 +1,159 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the interfaces to common kernels, such as copying, padding, and +// transposing a matrix. These functions are templated and thus header-only. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_COMMON_H_ +#define CLBLAST_ROUTINES_COMMON_H_ + +#include <string> +#include <vector> + +#include "internal/utilities.h" +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able +// to write to symmetric and triangular matrices through optional arguments. +template <typename T> +StatusCode PadCopyTransposeMatrix(Queue queue, const Device device, const Context context, + const Database db, + EventPointer event, std::vector<Event>& waitForEvents, + const size_t src_one, const size_t src_two, + const size_t src_ld, const size_t src_offset, + const Buffer<T> &src, + const size_t dest_one, const size_t dest_two, + const size_t dest_ld, const size_t dest_offset, + const Buffer<T> &dest, + const T alpha, + const Program &program, const bool do_pad, + const bool do_transpose, const bool do_conjugate, + const bool upper = false, const bool lower = false, + const bool diagonal_imag_zero = false) { + + // Determines whether or not the fast-version could potentially be used + auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && + (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && + (upper == false) && (lower == false) && (diagonal_imag_zero == false); + + // Determines the right kernel + auto kernel_name = std::string{}; + if (do_transpose) { + if (use_fast_kernel && + IsMultiple(src_ld, db["TRA_WPT"]) && + IsMultiple(src_one, db["TRA_WPT"]*db["TRA_WPT"]) && + IsMultiple(src_two, db["TRA_WPT"]*db["TRA_WPT"])) { + kernel_name = "TransposeMatrixFast"; + } + else { + use_fast_kernel = false; + kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; + } + } + else { + if (use_fast_kernel && + IsMultiple(src_ld, db["COPY_VW"]) && + IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) && + IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) { + kernel_name = "CopyMatrixFast"; + } + else { + use_fast_kernel = false; + kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix"; + } + } + + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer<T>(context, 1); + alpha_buffer.Write(queue, 1, &alpha); + + // Retrieves the kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + if (use_fast_kernel) { + kernel.SetArgument(0, static_cast<int>(src_ld)); + kernel.SetArgument(1, src()); + kernel.SetArgument(2, dest()); + kernel.SetArgument(3, alpha_buffer()); + } + else { + kernel.SetArgument(0, static_cast<int>(src_one)); + kernel.SetArgument(1, static_cast<int>(src_two)); + kernel.SetArgument(2, static_cast<int>(src_ld)); + kernel.SetArgument(3, static_cast<int>(src_offset)); + kernel.SetArgument(4, src()); + kernel.SetArgument(5, static_cast<int>(dest_one)); + kernel.SetArgument(6, static_cast<int>(dest_two)); + kernel.SetArgument(7, static_cast<int>(dest_ld)); + kernel.SetArgument(8, static_cast<int>(dest_offset)); + kernel.SetArgument(9, dest()); + kernel.SetArgument(10, alpha_buffer()); + if (do_pad) { + kernel.SetArgument(11, static_cast<int>(do_conjugate)); + } + else { + kernel.SetArgument(11, static_cast<int>(upper)); + kernel.SetArgument(12, static_cast<int>(lower)); + kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); + } + } + + // Launches the kernel and returns the error code. Uses global and local thread sizes based on + // parameters in the database. + if (do_transpose) { + if (use_fast_kernel) { + const auto global = std::vector<size_t>{ + dest_one / db["TRA_WPT"], + dest_two / db["TRA_WPT"] + }; + const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]}; + return RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + else { + const auto global = std::vector<size_t>{ + Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), + Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) + }; + const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]}; + return RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + } + else { + if (use_fast_kernel) { + const auto global = std::vector<size_t>{ + dest_one / db["COPY_VW"], + dest_two / db["COPY_WPT"] + }; + const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]}; + return RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + else { + const auto global = std::vector<size_t>{ + Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), + Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) + }; + const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]}; + return RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + } + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_COMMON_H_ +#endif diff --git a/include/internal/routines/level1/xamax.h b/include/internal/routines/level1/xamax.h index 70d8a6b0..54434362 100644 --- a/include/internal/routines/level1/xamax.h +++ b/include/internal/routines/level1/xamax.h @@ -28,10 +28,10 @@ class Xamax: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX"); diff --git a/include/internal/routines/level1/xasum.h b/include/internal/routines/level1/xasum.h index 8b5c9c76..ee593e30 100644 --- a/include/internal/routines/level1/xasum.h +++ b/include/internal/routines/level1/xasum.h @@ -28,10 +28,10 @@ class Xasum: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM"); diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h index bd8f5892..6ea3264b 100644 --- a/include/internal/routines/level1/xaxpy.h +++ b/include/internal/routines/level1/xaxpy.h @@ -28,10 +28,10 @@ class Xaxpy: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY"); diff --git a/include/internal/routines/level1/xcopy.h b/include/internal/routines/level1/xcopy.h index f01d5b08..b371ca9b 100644 --- a/include/internal/routines/level1/xcopy.h +++ b/include/internal/routines/level1/xcopy.h @@ -28,9 +28,9 @@ class Xcopy: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY"); diff --git a/include/internal/routines/level1/xdot.h b/include/internal/routines/level1/xdot.h index 46cf9959..7c69a902 100644 --- a/include/internal/routines/level1/xdot.h +++ b/include/internal/routines/level1/xdot.h @@ -28,10 +28,10 @@ class Xdot: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT"); diff --git a/include/internal/routines/level1/xnrm2.h b/include/internal/routines/level1/xnrm2.h index 3a0cf45c..f83cc2ce 100644 --- a/include/internal/routines/level1/xnrm2.h +++ b/include/internal/routines/level1/xnrm2.h @@ -28,10 +28,10 @@ class Xnrm2: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2"); diff --git a/include/internal/routines/level1/xscal.h b/include/internal/routines/level1/xscal.h index 9a0f83ab..40f017f2 100644 --- a/include/internal/routines/level1/xscal.h +++ b/include/internal/routines/level1/xscal.h @@ -28,9 +28,9 @@ class Xscal: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL"); diff --git a/include/internal/routines/level1/xswap.h b/include/internal/routines/level1/xswap.h index 02360c38..f794a1b4 100644 --- a/include/internal/routines/level1/xswap.h +++ b/include/internal/routines/level1/xswap.h @@ -28,9 +28,9 @@ class Xswap: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP"); diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h index dcfcbd1f..aec8b35b 100644 --- a/include/internal/routines/level2/xgemv.h +++ b/include/internal/routines/level2/xgemv.h @@ -28,10 +28,10 @@ class Xgemv: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV"); diff --git a/include/internal/routines/level2/xger.h b/include/internal/routines/level2/xger.h index a26ca24a..260325cb 100644 --- a/include/internal/routines/level2/xger.h +++ b/include/internal/routines/level2/xger.h @@ -28,10 +28,10 @@ class Xger: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xger(Queue &queue, EventPointer event, const std::string &name = "GER"); diff --git a/include/internal/routines/level2/xher.h b/include/internal/routines/level2/xher.h index d32d337e..d66b2603 100644 --- a/include/internal/routines/level2/xher.h +++ b/include/internal/routines/level2/xher.h @@ -28,10 +28,10 @@ class Xher: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xher(Queue &queue, EventPointer event, const std::string &name = "HER"); diff --git a/include/internal/routines/level2/xher2.h b/include/internal/routines/level2/xher2.h index 956ffe6a..35bf8190 100644 --- a/include/internal/routines/level2/xher2.h +++ b/include/internal/routines/level2/xher2.h @@ -28,10 +28,10 @@ class Xher2: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Constructor Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2"); diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h index 3b358080..c9107c25 100644 --- a/include/internal/routines/level2/xtbmv.h +++ b/include/internal/routines/level2/xtbmv.h @@ -28,6 +28,7 @@ class Xtbmv: public Xgemv<T> { // Members from the base class using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::context_; // Uses the generic matrix-vector routine diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h index f306cf4a..e85c225f 100644 --- a/include/internal/routines/level2/xtpmv.h +++ b/include/internal/routines/level2/xtpmv.h @@ -28,6 +28,7 @@ class Xtpmv: public Xgemv<T> { // Members from the base class using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::context_; // Uses the generic matrix-vector routine diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h index cf0824a4..97a180ff 100644 --- a/include/internal/routines/level2/xtrmv.h +++ b/include/internal/routines/level2/xtrmv.h @@ -28,6 +28,7 @@ class Xtrmv: public Xgemv<T> { // Members from the base class using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::context_; // Uses the generic matrix-vector routine diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h index 8facaa76..2fd853a9 100644 --- a/include/internal/routines/level3/xgemm.h +++ b/include/internal/routines/level3/xgemm.h @@ -28,11 +28,10 @@ class Xgemm: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::PadCopyTransposeMatrix; - using Routine<T>::RunKernel; // Constructor Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM"); diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h index cf833f57..8bd38393 100644 --- a/include/internal/routines/level3/xhemm.h +++ b/include/internal/routines/level3/xhemm.h @@ -27,9 +27,10 @@ class Xhemm: public Xgemm<T> { // Members and methods from the base class using Routine<T>::db_; + using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Uses the regular Xgemm routine using Xgemm<T>::DoGemm; diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h index cdba33ab..1afe87a6 100644 --- a/include/internal/routines/level3/xher2k.h +++ b/include/internal/routines/level3/xher2k.h @@ -30,11 +30,10 @@ class Xher2k: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::PadCopyTransposeMatrix; - using Routine<T>::RunKernel; // Constructor Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K"); diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h index a9d1615a..64abae3b 100644 --- a/include/internal/routines/level3/xherk.h +++ b/include/internal/routines/level3/xherk.h @@ -30,11 +30,10 @@ class Xherk: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::PadCopyTransposeMatrix; - using Routine<T>::RunKernel; // Constructor Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK"); diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h index ec145a89..c35dfb5e 100644 --- a/include/internal/routines/level3/xsymm.h +++ b/include/internal/routines/level3/xsymm.h @@ -29,9 +29,10 @@ class Xsymm: public Xgemm<T> { // Members and methods from the base class using Routine<T>::db_; + using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Uses the regular Xgemm routine using Xgemm<T>::DoGemm; diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h index 1093d635..73d11b0b 100644 --- a/include/internal/routines/level3/xsyr2k.h +++ b/include/internal/routines/level3/xsyr2k.h @@ -30,11 +30,10 @@ class Xsyr2k: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::PadCopyTransposeMatrix; - using Routine<T>::RunKernel; // Constructor Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K"); diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h index c2edbf2b..344c02e2 100644 --- a/include/internal/routines/level3/xsyrk.h +++ b/include/internal/routines/level3/xsyrk.h @@ -32,11 +32,10 @@ class Xsyrk: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::PadCopyTransposeMatrix; - using Routine<T>::RunKernel; // Constructor Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK"); diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h index 6cb46b01..5c12815d 100644 --- a/include/internal/routines/level3/xtrmm.h +++ b/include/internal/routines/level3/xtrmm.h @@ -28,9 +28,10 @@ class Xtrmm: public Xgemm<T> { // Members and methods from the base class using Routine<T>::db_; + using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::RunKernel; // Uses the regular Xgemm routine using Xgemm<T>::DoGemm; diff --git a/include/internal/routines/levelx/xomatcopy.h b/include/internal/routines/levelx/xomatcopy.h index ee38fa15..7c284635 100644 --- a/include/internal/routines/levelx/xomatcopy.h +++ b/include/internal/routines/levelx/xomatcopy.h @@ -25,10 +25,13 @@ class Xomatcopy: public Routine<T> { public: // Members and methods from the base class + using Routine<T>::db_; using Routine<T>::source_string_; + using Routine<T>::queue_; + using Routine<T>::device_; using Routine<T>::event_; + using Routine<T>::context_; using Routine<T>::GetProgramFromCache; - using Routine<T>::PadCopyTransposeMatrix; // Constructor Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY"); |