summaryrefslogtreecommitdiff
path: root/src/routines
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-06-17 12:32:06 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-06-17 12:32:06 +0200
commit98a95c89fc0633efdc8439c942762bef9a1e5e1d (patch)
treed37775c4bf07229f7eae38c335da85eaf0c468a9 /src/routines
parent520e28e7a72f288f04d04d86d4e7560d78159820 (diff)
Moved the RunKernel and PadCopyTransposeMatrix functions out of the Routine class
Diffstat (limited to 'src/routines')
-rw-r--r--src/routines/level1/xamax.cc4
-rw-r--r--src/routines/level1/xasum.cc4
-rw-r--r--src/routines/level1/xaxpy.cc4
-rw-r--r--src/routines/level1/xcopy.cc4
-rw-r--r--src/routines/level1/xdot.cc4
-rw-r--r--src/routines/level1/xnrm2.cc4
-rw-r--r--src/routines/level1/xscal.cc4
-rw-r--r--src/routines/level1/xswap.cc4
-rw-r--r--src/routines/level2/xgemv.cc2
-rw-r--r--src/routines/level2/xger.cc2
-rw-r--r--src/routines/level2/xher.cc2
-rw-r--r--src/routines/level2/xher2.cc2
-rw-r--r--src/routines/level3/xgemm.cc10
-rw-r--r--src/routines/level3/xhemm.cc2
-rw-r--r--src/routines/level3/xher2k.cc16
-rw-r--r--src/routines/level3/xherk.cc10
-rw-r--r--src/routines/level3/xsymm.cc2
-rw-r--r--src/routines/level3/xsyr2k.cc12
-rw-r--r--src/routines/level3/xsyrk.cc8
-rw-r--r--src/routines/level3/xtrmm.cc2
-rw-r--r--src/routines/levelx/xomatcopy.cc2
21 files changed, 52 insertions, 52 deletions
diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc
index 9a7d2173..6028d953 100644
--- a/src/routines/level1/xamax.cc
+++ b/src/routines/level1/xamax.cc
@@ -80,7 +80,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
- status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+ status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
@@ -93,7 +93,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+ status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc
index 3dcaa80a..6046a467 100644
--- a/src/routines/level1/xasum.cc
+++ b/src/routines/level1/xasum.cc
@@ -78,7 +78,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
- status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+ status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
@@ -90,7 +90,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+ status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc
index b57001f9..dbc05cf7 100644
--- a/src/routines/level1/xaxpy.cc
+++ b/src/routines/level1/xaxpy.cc
@@ -94,13 +94,13 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc
index 273e87a6..8848201c 100644
--- a/src/routines/level1/xcopy.cc
+++ b/src/routines/level1/xcopy.cc
@@ -88,13 +88,13 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc
index 25eccadf..a819564a 100644
--- a/src/routines/level1/xdot.cc
+++ b/src/routines/level1/xdot.cc
@@ -86,7 +86,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
- status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+ status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
@@ -98,7 +98,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+ status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc
index eca283f8..8904c369 100644
--- a/src/routines/level1/xnrm2.cc
+++ b/src/routines/level1/xnrm2.cc
@@ -78,7 +78,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
- status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+ status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
@@ -90,7 +90,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+ status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc
index 0ce211b6..8078c076 100644
--- a/src/routines/level1/xscal.cc
+++ b/src/routines/level1/xscal.cc
@@ -82,13 +82,13 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc
index 773d78b5..01184db5 100644
--- a/src/routines/level1/xswap.cc
+++ b/src/routines/level1/xswap.cc
@@ -88,13 +88,13 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index 18e61f28..07c6ec9d 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -169,7 +169,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Launches the kernel
auto global = std::vector<size_t>{global_size};
auto local = std::vector<size_t>{local_size};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc
index 7d6fb091..c69efc23 100644
--- a/src/routines/level2/xger.cc
+++ b/src/routines/level2/xger.cc
@@ -94,7 +94,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc
index 3d5c0baf..ed8763dc 100644
--- a/src/routines/level2/xher.cc
+++ b/src/routines/level2/xher.cc
@@ -105,7 +105,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc
index a7362410..10b98329 100644
--- a/src/routines/level2/xher2.cc
+++ b/src/routines/level2/xher2.cc
@@ -96,7 +96,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, global, local, event_);
+ status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 713bed8f..eced53ab 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -142,7 +142,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
ConstantOne<T>(), program,
@@ -154,7 +154,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// As above, but now for matrix B
if (!b_no_temp) {
auto eventProcessB = Event();
- status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
@@ -166,7 +166,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
if (!c_no_temp && beta != static_cast<T>(0)) {
auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
ConstantOne<T>(), program,
@@ -199,13 +199,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// Launches the kernel
auto eventKernel = Event();
auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
- status = RunKernel(kernel, global, local, eventPointer, eventWaitList);
+ status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel if needed
if (!c_no_temp) {
eventWaitList.push_back(eventKernel);
- status = PadCopyTransposeMatrix(event_, eventWaitList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc
index a6e853e9..9791d7b4 100644
--- a/src/routines/level3/xhemm.cc
+++ b/src/routines/level3/xhemm.cc
@@ -80,7 +80,7 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
auto kernelEvent = Event();
- status = RunKernel(kernel, global, local, kernelEvent.pointer());
+ status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
// Synchronize now: 'DoGemm' does not accept a list of events to wait for
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index c891c202..43f7bb76 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -129,7 +129,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// case nothing has to be done, these kernels can be skipped.
if (!a1_no_temp) {
auto eventProcessA1 = Event();
- status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA1.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
ConstantOne<T>(), program,
@@ -139,7 +139,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
}
if (!a2_no_temp) {
auto eventProcessA2 = Event();
- status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA2.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
ConstantOne<T>(), program,
@@ -149,7 +149,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
}
if (!b1_no_temp) {
auto eventProcessB1 = Event();
- status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB1.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
ConstantOne<T>(), program,
@@ -159,7 +159,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
}
if (!b2_no_temp) {
auto eventProcessB2 = Event();
- status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB2.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
ConstantOne<T>(), program,
@@ -171,7 +171,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
@@ -201,7 +201,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Launches the kernel
auto eventKernel1 = Event();
- status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList);
+ status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel1);
@@ -217,14 +217,14 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Runs the kernel again
auto eventKernel2 = Event();
- status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList);
+ status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel2);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(event_, eventWaitList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index 9d64af95..8ebcbfa8 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -121,7 +121,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// case nothing has to be done, these kernels can be skipped. Two copies are created.
if (!a_no_temp) {
auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
@@ -131,7 +131,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
}
if (!b_no_temp) {
auto eventProcessB = Event();
- status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
@@ -143,7 +143,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
@@ -173,14 +173,14 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// Launches the kernel
auto eventKernel = Event();
- status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList);
+ status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(event_, eventWaitList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc
index 379e2908..650afbfc 100644
--- a/src/routines/level3/xsymm.cc
+++ b/src/routines/level3/xsymm.cc
@@ -80,7 +80,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
auto kernelEvent = Event();
- status = RunKernel(kernel, global, local, kernelEvent.pointer());
+ status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
// Synchronize now: 'DoGemm' does not accept a list of events to wait for
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index 886d1e16..4b436381 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -121,7 +121,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
@@ -131,7 +131,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
}
if (!b_no_temp) {
auto eventProcessB = Event();
- status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
@@ -143,7 +143,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
@@ -173,7 +173,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Launches the kernel
auto eventKernel1 = Event();
- status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList);
+ status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel1);
@@ -186,14 +186,14 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Runs the kernel again
auto eventKernel2 = Event();
- status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList);
+ status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel2);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(event_, eventWaitList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index 000347f3..5c49795b 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -114,7 +114,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
- status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
@@ -126,7 +126,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
- status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
@@ -156,14 +156,14 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// Launches the kernel
auto eventKernel = Event();
- status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList);
+ status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(event_, eventWaitList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc
index c62305aa..98e0622d 100644
--- a/src/routines/level3/xtrmm.cc
+++ b/src/routines/level3/xtrmm.cc
@@ -83,7 +83,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
auto kernelEvent = Event();
- status = RunKernel(kernel, global, local, kernelEvent.pointer());
+ status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
// Synchronize now: 'DoGemm' does not accept a list of events to wait for
diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc
index dcc4e52a..199a4903 100644
--- a/src/routines/levelx/xomatcopy.cc
+++ b/src/routines/levelx/xomatcopy.cc
@@ -81,7 +81,7 @@ StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_trans
const auto program = GetProgramFromCache();
auto emptyEventList = std::vector<Event>();
- status = PadCopyTransposeMatrix(event_, emptyEventList,
+ status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
b_one, b_two, b_ld, b_offset, b_buffer,
alpha, program, false, transpose, conjugate);