summary | refs | log | tree | commit | diff
path: root/include
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2016-06-17 12:32:06 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2016-06-17 12:32:06 +0200
commit 98a95c89fc0633efdc8439c942762bef9a1e5e1d (patch)
tree d37775c4bf07229f7eae38c335da85eaf0c468a9 /include
parent 520e28e7a72f288f04d04d86d4e7560d78159820 (diff)
Moved the RunKernel and PadCopyTransposeMatrix functions out of the Routine class
Diffstat (limited to 'include')
-rw-r--r-- include/internal/routine.h 42
-rw-r--r-- include/internal/routines/common.h 159
-rw-r--r-- include/internal/routines/level1/xamax.h 2
-rw-r--r-- include/internal/routines/level1/xasum.h 2
-rw-r--r-- include/internal/routines/level1/xaxpy.h 2
-rw-r--r-- include/internal/routines/level1/xcopy.h 2
-rw-r--r-- include/internal/routines/level1/xdot.h 2
-rw-r--r-- include/internal/routines/level1/xnrm2.h 2
-rw-r--r-- include/internal/routines/level1/xscal.h 2
-rw-r--r-- include/internal/routines/level1/xswap.h 2
-rw-r--r-- include/internal/routines/level2/xgemv.h 2
-rw-r--r-- include/internal/routines/level2/xger.h 2
-rw-r--r-- include/internal/routines/level2/xher.h 2
-rw-r--r-- include/internal/routines/level2/xher2.h 2
-rw-r--r-- include/internal/routines/level2/xtbmv.h 1
-rw-r--r-- include/internal/routines/level2/xtpmv.h 1
-rw-r--r-- include/internal/routines/level2/xtrmv.h 1
-rw-r--r-- include/internal/routines/level3/xgemm.h 3
-rw-r--r-- include/internal/routines/level3/xhemm.h 3
-rw-r--r-- include/internal/routines/level3/xher2k.h 3
-rw-r--r-- include/internal/routines/level3/xherk.h 3
-rw-r--r-- include/internal/routines/level3/xsymm.h 3
-rw-r--r-- include/internal/routines/level3/xsyr2k.h 3
-rw-r--r-- include/internal/routines/level3/xsyrk.h 3
-rw-r--r-- include/internal/routines/level3/xtrmm.h 3
-rw-r--r-- include/internal/routines/levelx/xomatcopy.h 5
26 files changed, 204 insertions, 53 deletions
diff --git a/include/internal/routine.h b/include/internal/routine.h
index 0f64c479..e1888f1f 100644
--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@@ -40,30 +40,6 @@ class Routine {
StatusCode SetUp();
protected:
-
- // Runs a kernel given the global and local thread sizes
- StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global,
- const std::vector<size_t> &local, EventPointer event,
- std::vector<Event>& waitForEvents);
-
- // As above, but without an event waiting list
- StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global,
- const std::vector<size_t> &local, EventPointer event);
-
- // Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write
- // to symmetric and triangular matrices through optional arguments.
- StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
- const size_t src_one, const size_t src_two,
- const size_t src_ld, const size_t src_offset,
- const Buffer<T> &src,
- const size_t dest_one, const size_t dest_two,
- const size_t dest_ld, const size_t dest_offset,
- const Buffer<T> &dest,
- const T alpha,
- const Program &program, const bool do_pad,
- const bool do_transpose, const bool do_conjugate,
- const bool upper = false, const bool lower = false,
- const bool diagonal_imag_zero = false);
// Stores a newly compiled binary/program into the cache
void StoreBinaryToCache(const std::string& binary) const {
@@ -105,16 +81,28 @@ class Routine {
// OpenCL device properties
const std::string device_name_;
- const size_t max_work_item_dimensions_;
- const std::vector<size_t> max_work_item_sizes_;
- const size_t max_work_group_size_;
// Connection to the database for all the device-specific parameters
const Database db_;
};
// =================================================================================================
+
+// Enqueues a kernel, waits for completion, and checks for errors
+StatusCode RunKernel(Kernel &kernel, Queue queue, const Device device,
+ std::vector<size_t> global, const std::vector<size_t> &local,
+ EventPointer event, std::vector<Event>& waitForEvents);
+
+// As above, but without an event waiting list
+StatusCode RunKernel(Kernel &kernel, Queue queue, const Device device,
+ std::vector<size_t> global, const std::vector<size_t> &local,
+ EventPointer event);
+
+// =================================================================================================
} // namespace clblast
+// Temporary fix: TODO place include in a more logical place
+#include "internal/routines/common.h"
+
// CLBLAST_ROUTINE_H_
#endif
diff --git a/include/internal/routines/common.h b/include/internal/routines/common.h
new file mode 100644
index 00000000..95fbde46
--- /dev/null
+++ b/include/internal/routines/common.h
@@ -0,0 +1,159 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the interfaces to common kernels, such as copying, padding, and
+// transposing a matrix. These functions are templated and thus header-only.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_COMMON_H_
+#define CLBLAST_ROUTINES_COMMON_H_
+
+#include <string>
+#include <vector>
+
+#include "internal/utilities.h"
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Copies or transposes a matrix and optionally pads/unpads it with zeros. Optional arguments allow
+// writing to symmetric/triangular matrices. Exceptions from kernel set-up/launch give kInvalidKernel.
+template <typename T>
+StatusCode PadCopyTransposeMatrix(Queue queue, const Device device, const Context context,
+                                  const Database db,
+                                  EventPointer event, std::vector<Event>& waitForEvents,
+                                  const size_t src_one, const size_t src_two,
+                                  const size_t src_ld, const size_t src_offset,
+                                  const Buffer<T> &src,
+                                  const size_t dest_one, const size_t dest_two,
+                                  const size_t dest_ld, const size_t dest_offset,
+                                  const Buffer<T> &dest,
+                                  const T alpha,
+                                  const Program &program, const bool do_pad,
+                                  const bool do_transpose, const bool do_conjugate,
+                                  const bool upper = false, const bool lower = false,
+                                  const bool diagonal_imag_zero = false) {
+
+  // A fast kernel is only a candidate for identical, offset-free layouts without extra options
+  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
+                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
+                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);
+  // Determines the right kernel. The fast transpose is launched below with (dim / TRA_WPT) threads
+  // in work-groups of TRA_DIM, so each dimension has to be a multiple of TRA_WPT * TRA_DIM.
+  auto kernel_name = std::string{};
+  if (do_transpose) {
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db["TRA_WPT"]) &&
+        IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
+        IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {
+      kernel_name = "TransposeMatrixFast";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
+    }
+  }
+  else {
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db["COPY_VW"]) &&
+        IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) &&
+        IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) {
+      kernel_name = "CopyMatrixFast";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
+    }
+  }
+
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context, 1);
+  alpha_buffer.Write(queue, 1, &alpha);
+
+  // Retrieves the kernel from the compiled binary
+  try {
+    auto kernel = Kernel(program, kernel_name);
+
+    // Sets the kernel arguments: the fast kernels only take the leading dimension and the buffers
+    if (use_fast_kernel) {
+      kernel.SetArgument(0, static_cast<int>(src_ld));
+      kernel.SetArgument(1, src());
+      kernel.SetArgument(2, dest());
+      kernel.SetArgument(3, alpha_buffer());
+    }
+    else {
+      kernel.SetArgument(0, static_cast<int>(src_one));
+      kernel.SetArgument(1, static_cast<int>(src_two));
+      kernel.SetArgument(2, static_cast<int>(src_ld));
+      kernel.SetArgument(3, static_cast<int>(src_offset));
+      kernel.SetArgument(4, src());
+      kernel.SetArgument(5, static_cast<int>(dest_one));
+      kernel.SetArgument(6, static_cast<int>(dest_two));
+      kernel.SetArgument(7, static_cast<int>(dest_ld));
+      kernel.SetArgument(8, static_cast<int>(dest_offset));
+      kernel.SetArgument(9, dest());
+      kernel.SetArgument(10, alpha_buffer());
+      if (do_pad) {
+        kernel.SetArgument(11, static_cast<int>(do_conjugate));
+      }
+      else {
+        kernel.SetArgument(11, static_cast<int>(upper));
+        kernel.SetArgument(12, static_cast<int>(lower));
+        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
+      }
+    }
+
+    // Launches the kernel and returns its status code. The global and local thread sizes are
+    // derived from the database parameters that belong to the selected kernel.
+    if (do_transpose) {
+      if (use_fast_kernel) {
+        const auto global = std::vector<size_t>{
+          dest_one / db["TRA_WPT"],
+          dest_two / db["TRA_WPT"]
+        };
+        const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+      else {
+        const auto global = std::vector<size_t>{
+          Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
+          Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
+        };
+        const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+    }
+    else {
+      if (use_fast_kernel) {
+        const auto global = std::vector<size_t>{
+          dest_one / db["COPY_VW"],
+          dest_two / db["COPY_WPT"]
+        };
+        const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+      else {
+        const auto global = std::vector<size_t>{
+          Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
+          Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
+        };
+        const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+    }
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_COMMON_H_
+#endif
diff --git a/include/internal/routines/level1/xamax.h b/include/internal/routines/level1/xamax.h
index 70d8a6b0..54434362 100644
--- a/include/internal/routines/level1/xamax.h
+++ b/include/internal/routines/level1/xamax.h
@@ -28,10 +28,10 @@ class Xamax: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
diff --git a/include/internal/routines/level1/xasum.h b/include/internal/routines/level1/xasum.h
index 8b5c9c76..ee593e30 100644
--- a/include/internal/routines/level1/xasum.h
+++ b/include/internal/routines/level1/xasum.h
@@ -28,10 +28,10 @@ class Xasum: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h
index bd8f5892..6ea3264b 100644
--- a/include/internal/routines/level1/xaxpy.h
+++ b/include/internal/routines/level1/xaxpy.h
@@ -28,10 +28,10 @@ class Xaxpy: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
diff --git a/include/internal/routines/level1/xcopy.h b/include/internal/routines/level1/xcopy.h
index f01d5b08..b371ca9b 100644
--- a/include/internal/routines/level1/xcopy.h
+++ b/include/internal/routines/level1/xcopy.h
@@ -28,9 +28,9 @@ class Xcopy: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
diff --git a/include/internal/routines/level1/xdot.h b/include/internal/routines/level1/xdot.h
index 46cf9959..7c69a902 100644
--- a/include/internal/routines/level1/xdot.h
+++ b/include/internal/routines/level1/xdot.h
@@ -28,10 +28,10 @@ class Xdot: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
diff --git a/include/internal/routines/level1/xnrm2.h b/include/internal/routines/level1/xnrm2.h
index 3a0cf45c..f83cc2ce 100644
--- a/include/internal/routines/level1/xnrm2.h
+++ b/include/internal/routines/level1/xnrm2.h
@@ -28,10 +28,10 @@ class Xnrm2: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
diff --git a/include/internal/routines/level1/xscal.h b/include/internal/routines/level1/xscal.h
index 9a0f83ab..40f017f2 100644
--- a/include/internal/routines/level1/xscal.h
+++ b/include/internal/routines/level1/xscal.h
@@ -28,9 +28,9 @@ class Xscal: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
diff --git a/include/internal/routines/level1/xswap.h b/include/internal/routines/level1/xswap.h
index 02360c38..f794a1b4 100644
--- a/include/internal/routines/level1/xswap.h
+++ b/include/internal/routines/level1/xswap.h
@@ -28,9 +28,9 @@ class Xswap: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h
index dcfcbd1f..aec8b35b 100644
--- a/include/internal/routines/level2/xgemv.h
+++ b/include/internal/routines/level2/xgemv.h
@@ -28,10 +28,10 @@ class Xgemv: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
diff --git a/include/internal/routines/level2/xger.h b/include/internal/routines/level2/xger.h
index a26ca24a..260325cb 100644
--- a/include/internal/routines/level2/xger.h
+++ b/include/internal/routines/level2/xger.h
@@ -28,10 +28,10 @@ class Xger: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
diff --git a/include/internal/routines/level2/xher.h b/include/internal/routines/level2/xher.h
index d32d337e..d66b2603 100644
--- a/include/internal/routines/level2/xher.h
+++ b/include/internal/routines/level2/xher.h
@@ -28,10 +28,10 @@ class Xher: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
diff --git a/include/internal/routines/level2/xher2.h b/include/internal/routines/level2/xher2.h
index 956ffe6a..35bf8190 100644
--- a/include/internal/routines/level2/xher2.h
+++ b/include/internal/routines/level2/xher2.h
@@ -28,10 +28,10 @@ class Xher2: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Constructor
Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h
index 3b358080..c9107c25 100644
--- a/include/internal/routines/level2/xtbmv.h
+++ b/include/internal/routines/level2/xtbmv.h
@@ -28,6 +28,7 @@ class Xtbmv: public Xgemv<T> {
// Members from the base class
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::context_;
// Uses the generic matrix-vector routine
diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h
index f306cf4a..e85c225f 100644
--- a/include/internal/routines/level2/xtpmv.h
+++ b/include/internal/routines/level2/xtpmv.h
@@ -28,6 +28,7 @@ class Xtpmv: public Xgemv<T> {
// Members from the base class
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::context_;
// Uses the generic matrix-vector routine
diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h
index cf0824a4..97a180ff 100644
--- a/include/internal/routines/level2/xtrmv.h
+++ b/include/internal/routines/level2/xtrmv.h
@@ -28,6 +28,7 @@ class Xtrmv: public Xgemv<T> {
// Members from the base class
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::context_;
// Uses the generic matrix-vector routine
diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h
index 8facaa76..2fd853a9 100644
--- a/include/internal/routines/level3/xgemm.h
+++ b/include/internal/routines/level3/xgemm.h
@@ -28,11 +28,10 @@ class Xgemm: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::PadCopyTransposeMatrix;
- using Routine<T>::RunKernel;
// Constructor
Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h
index cf833f57..8bd38393 100644
--- a/include/internal/routines/level3/xhemm.h
+++ b/include/internal/routines/level3/xhemm.h
@@ -27,9 +27,10 @@ class Xhemm: public Xgemm<T> {
// Members and methods from the base class
using Routine<T>::db_;
+ using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Uses the regular Xgemm routine
using Xgemm<T>::DoGemm;
diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h
index cdba33ab..1afe87a6 100644
--- a/include/internal/routines/level3/xher2k.h
+++ b/include/internal/routines/level3/xher2k.h
@@ -30,11 +30,10 @@ class Xher2k: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::PadCopyTransposeMatrix;
- using Routine<T>::RunKernel;
// Constructor
Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h
index a9d1615a..64abae3b 100644
--- a/include/internal/routines/level3/xherk.h
+++ b/include/internal/routines/level3/xherk.h
@@ -30,11 +30,10 @@ class Xherk: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::PadCopyTransposeMatrix;
- using Routine<T>::RunKernel;
// Constructor
Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h
index ec145a89..c35dfb5e 100644
--- a/include/internal/routines/level3/xsymm.h
+++ b/include/internal/routines/level3/xsymm.h
@@ -29,9 +29,10 @@ class Xsymm: public Xgemm<T> {
// Members and methods from the base class
using Routine<T>::db_;
+ using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Uses the regular Xgemm routine
using Xgemm<T>::DoGemm;
diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h
index 1093d635..73d11b0b 100644
--- a/include/internal/routines/level3/xsyr2k.h
+++ b/include/internal/routines/level3/xsyr2k.h
@@ -30,11 +30,10 @@ class Xsyr2k: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::PadCopyTransposeMatrix;
- using Routine<T>::RunKernel;
// Constructor
Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h
index c2edbf2b..344c02e2 100644
--- a/include/internal/routines/level3/xsyrk.h
+++ b/include/internal/routines/level3/xsyrk.h
@@ -32,11 +32,10 @@ class Xsyrk: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::PadCopyTransposeMatrix;
- using Routine<T>::RunKernel;
// Constructor
Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h
index 6cb46b01..5c12815d 100644
--- a/include/internal/routines/level3/xtrmm.h
+++ b/include/internal/routines/level3/xtrmm.h
@@ -28,9 +28,10 @@ class Xtrmm: public Xgemm<T> {
// Members and methods from the base class
using Routine<T>::db_;
+ using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::RunKernel;
// Uses the regular Xgemm routine
using Xgemm<T>::DoGemm;
diff --git a/include/internal/routines/levelx/xomatcopy.h b/include/internal/routines/levelx/xomatcopy.h
index ee38fa15..7c284635 100644
--- a/include/internal/routines/levelx/xomatcopy.h
+++ b/include/internal/routines/levelx/xomatcopy.h
@@ -25,10 +25,13 @@ class Xomatcopy: public Routine<T> {
public:
// Members and methods from the base class
+ using Routine<T>::db_;
using Routine<T>::source_string_;
+ using Routine<T>::queue_;
+ using Routine<T>::device_;
using Routine<T>::event_;
+ using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
- using Routine<T>::PadCopyTransposeMatrix;
// Constructor
Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY");