summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md1
-rw-r--r--include/internal/clpp11.h55
-rw-r--r--include/internal/routine.h14
-rw-r--r--include/internal/routines/level1/xaxpy.h3
-rw-r--r--include/internal/routines/level1/xcopy.h3
-rw-r--r--include/internal/routines/level1/xdot.h3
-rw-r--r--include/internal/routines/level1/xdotc.h2
-rw-r--r--include/internal/routines/level1/xdotu.h2
-rw-r--r--include/internal/routines/level1/xnrm2.h3
-rw-r--r--include/internal/routines/level1/xscal.h3
-rw-r--r--include/internal/routines/level1/xswap.h3
-rw-r--r--include/internal/routines/level2/xgbmv.h2
-rw-r--r--include/internal/routines/level2/xgemv.h3
-rw-r--r--include/internal/routines/level2/xger.h3
-rw-r--r--include/internal/routines/level2/xgerc.h2
-rw-r--r--include/internal/routines/level2/xgeru.h2
-rw-r--r--include/internal/routines/level2/xhbmv.h2
-rw-r--r--include/internal/routines/level2/xhemv.h2
-rw-r--r--include/internal/routines/level2/xher.h3
-rw-r--r--include/internal/routines/level2/xher2.h3
-rw-r--r--include/internal/routines/level2/xhpmv.h2
-rw-r--r--include/internal/routines/level2/xhpr.h2
-rw-r--r--include/internal/routines/level2/xhpr2.h2
-rw-r--r--include/internal/routines/level2/xsbmv.h2
-rw-r--r--include/internal/routines/level2/xspmv.h2
-rw-r--r--include/internal/routines/level2/xspr.h2
-rw-r--r--include/internal/routines/level2/xspr2.h2
-rw-r--r--include/internal/routines/level2/xsymv.h2
-rw-r--r--include/internal/routines/level2/xsyr.h2
-rw-r--r--include/internal/routines/level2/xsyr2.h2
-rw-r--r--include/internal/routines/level2/xtbmv.h2
-rw-r--r--include/internal/routines/level2/xtpmv.h2
-rw-r--r--include/internal/routines/level2/xtrmv.h2
-rw-r--r--include/internal/routines/level3/xgemm.h3
-rw-r--r--include/internal/routines/level3/xhemm.h2
-rw-r--r--include/internal/routines/level3/xher2k.h3
-rw-r--r--include/internal/routines/level3/xherk.h3
-rw-r--r--include/internal/routines/level3/xsymm.h2
-rw-r--r--include/internal/routines/level3/xsyr2k.h3
-rw-r--r--include/internal/routines/level3/xsyrk.h3
-rw-r--r--include/internal/routines/level3/xtrmm.h2
-rw-r--r--samples/sgemm.cc7
-rw-r--r--scripts/generator/generator.py3
-rw-r--r--src/clblast.cc114
-rw-r--r--src/routine.cc31
-rw-r--r--src/routines/level1/xaxpy.cc6
-rw-r--r--src/routines/level1/xcopy.cc6
-rw-r--r--src/routines/level1/xdot.cc11
-rw-r--r--src/routines/level1/xdotc.cc2
-rw-r--r--src/routines/level1/xdotu.cc2
-rw-r--r--src/routines/level1/xnrm2.cc10
-rw-r--r--src/routines/level1/xscal.cc6
-rw-r--r--src/routines/level1/xswap.cc6
-rw-r--r--src/routines/level2/xgbmv.cc2
-rw-r--r--src/routines/level2/xgemv.cc4
-rw-r--r--src/routines/level2/xger.cc4
-rw-r--r--src/routines/level2/xgerc.cc2
-rw-r--r--src/routines/level2/xgeru.cc2
-rw-r--r--src/routines/level2/xhbmv.cc2
-rw-r--r--src/routines/level2/xhemv.cc2
-rw-r--r--src/routines/level2/xher.cc4
-rw-r--r--src/routines/level2/xher2.cc4
-rw-r--r--src/routines/level2/xhpmv.cc2
-rw-r--r--src/routines/level2/xhpr.cc2
-rw-r--r--src/routines/level2/xhpr2.cc2
-rw-r--r--src/routines/level2/xsbmv.cc2
-rw-r--r--src/routines/level2/xspmv.cc2
-rw-r--r--src/routines/level2/xspr.cc2
-rw-r--r--src/routines/level2/xspr2.cc2
-rw-r--r--src/routines/level2/xsymv.cc2
-rw-r--r--src/routines/level2/xsyr.cc2
-rw-r--r--src/routines/level2/xsyr2.cc2
-rw-r--r--src/routines/level2/xtbmv.cc2
-rw-r--r--src/routines/level2/xtpmv.cc2
-rw-r--r--src/routines/level2/xtrmv.cc2
-rw-r--r--src/routines/level3/xgemm.cc28
-rw-r--r--src/routines/level3/xhemm.cc8
-rw-r--r--src/routines/level3/xher2k.cc42
-rw-r--r--src/routines/level3/xherk.cc28
-rw-r--r--src/routines/level3/xsymm.cc8
-rw-r--r--src/routines/level3/xsyr2k.cc32
-rw-r--r--src/routines/level3/xsyrk.cc24
-rw-r--r--src/routines/level3/xtrmm.cc8
83 files changed, 360 insertions, 235 deletions
diff --git a/README.md b/README.md
index d69ad552..74d8c9cc 100644
--- a/README.md
+++ b/README.md
@@ -255,5 +255,4 @@ To-do list before release of version 1.0
-------------
- Support all routines supported by clBLAS
-- Allow the user control over events and synchronization
- Add half-precision routines (e.g. HGEMM)
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index 00905ef7..543d423a 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -73,29 +73,41 @@ class Event {
public:
// Constructor based on the regular OpenCL data-type
- explicit Event(cl_event* event): event_(event) { }
+ explicit Event(const cl_event event): event_(event) { }
+
+ // Regular constructor
+ explicit Event(): event_(nullptr) { }
+
+ // Waits for completion of this event
+ void WaitForCompletion() const {
+ CheckError(clWaitForEvents(1, &event_));
+ }
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
float GetElapsedTime() const {
- CheckError(clWaitForEvents(1, event_));
+ WaitForCompletion();
auto bytes = size_t{0};
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
auto time_start = size_t{0};
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
auto time_end = size_t{0};
- clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
return (time_end - time_start) * 1.0e-6f;
}
// Accessor to the private data-member
- cl_event& operator()() { return *event_; }
+ cl_event& operator()() { return event_; }
+ cl_event* pointer() { return &event_; }
private:
- cl_event* event_;
+ cl_event event_;
};
+// Pointer to an OpenCL event
+using EventPointer = cl_event*;
+
// =================================================================================================
// C++11 version of 'cl_platform_id'
@@ -600,17 +612,36 @@ class Kernel {
// Launches a kernel onto the specified queue
void Launch(const Queue &queue, const std::vector<size_t> &global,
- const std::vector<size_t> &local, Event &event) {
+ const std::vector<size_t> &local, EventPointer event) {
+ CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
+ nullptr, global.data(), local.data(),
+ 0, nullptr, event));
+ }
+
+ // As above, but with an event waiting list
+ void Launch(const Queue &queue, const std::vector<size_t> &global,
+ const std::vector<size_t> &local, EventPointer event,
+ std::vector<Event>& waitForEvents) {
+ if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
+
+ // Builds a plain version of the events waiting list
+ auto waitForEventsPlain = std::vector<cl_event>();
+ for (auto &waitEvent : waitForEvents) {
+ waitForEventsPlain.push_back(waitEvent());
+ }
+
+ // Launches the kernel while waiting for other events
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
- 0, nullptr, &(event())));
+ waitForEventsPlain.size(), waitForEventsPlain.data(),
+ event));
}
// As above, but with the default local workgroup size
- void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
+ void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), nullptr,
- 0, nullptr, &(event())));
+ 0, nullptr, event));
}
// Accessor to the private data-member
diff --git a/include/internal/routine.h b/include/internal/routine.h
index 5f5b8211..b2b6f622 100644
--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@@ -55,7 +55,7 @@ class Routine {
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
// Base class constructor
- explicit Routine(Queue &queue, Event &event, const std::string &name,
+ explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision);
// Set-up phase of the kernel
@@ -65,7 +65,12 @@ class Routine {
// Runs a kernel given the global and local thread sizes
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
- const std::vector<size_t> &local);
+ const std::vector<size_t> &local, EventPointer event,
+ std::vector<Event>& waitForEvents);
+
+ // As above, but without an event waiting list
+ StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
+ const std::vector<size_t> &local, EventPointer event);
// Tests for valid inputs of matrices A, B, and C
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
@@ -87,7 +92,8 @@ class Routine {
// Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write
// to symmetric and triangular matrices through optional arguments.
- StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
+ StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
+ const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
@@ -114,7 +120,7 @@ class Routine {
// The OpenCL objects, accessible only from derived classes
Queue queue_;
- Event event_;
+ EventPointer event_;
const Context context_;
const Device device_;
diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h
index 689cf169..bc00c8e3 100644
--- a/include/internal/routines/level1/xaxpy.h
+++ b/include/internal/routines/level1/xaxpy.h
@@ -28,6 +28,7 @@ class Xaxpy: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@@ -35,7 +36,7 @@ class Xaxpy: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xaxpy(Queue &queue, Event &event, const std::string &name = "AXPY");
+ Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
// Templated-precision implementation of the routine
StatusCode DoAxpy(const size_t n, const T alpha,
diff --git a/include/internal/routines/level1/xcopy.h b/include/internal/routines/level1/xcopy.h
index 15f339aa..5786cb0f 100644
--- a/include/internal/routines/level1/xcopy.h
+++ b/include/internal/routines/level1/xcopy.h
@@ -28,6 +28,7 @@ class Xcopy: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@@ -35,7 +36,7 @@ class Xcopy: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xcopy(Queue &queue, Event &event, const std::string &name = "COPY");
+ Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
// Templated-precision implementation of the routine
StatusCode DoCopy(const size_t n,
diff --git a/include/internal/routines/level1/xdot.h b/include/internal/routines/level1/xdot.h
index 64b62945..95a7ad07 100644
--- a/include/internal/routines/level1/xdot.h
+++ b/include/internal/routines/level1/xdot.h
@@ -28,6 +28,7 @@ class Xdot: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
@@ -37,7 +38,7 @@ class Xdot: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xdot(Queue &queue, Event &event, const std::string &name = "DOT");
+ Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
// Templated-precision implementation of the routine
StatusCode DoDot(const size_t n,
diff --git a/include/internal/routines/level1/xdotc.h b/include/internal/routines/level1/xdotc.h
index 726cec7c..0dc2cfe9 100644
--- a/include/internal/routines/level1/xdotc.h
+++ b/include/internal/routines/level1/xdotc.h
@@ -28,7 +28,7 @@ class Xdotc: public Xdot<T> {
using Xdot<T>::DoDot;
// Constructor
- Xdotc(Queue &queue, Event &event, const std::string &name = "DOTC");
+ Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
// Templated-precision implementation of the routine
StatusCode DoDotc(const size_t n,
diff --git a/include/internal/routines/level1/xdotu.h b/include/internal/routines/level1/xdotu.h
index 825ebb78..98988744 100644
--- a/include/internal/routines/level1/xdotu.h
+++ b/include/internal/routines/level1/xdotu.h
@@ -28,7 +28,7 @@ class Xdotu: public Xdot<T> {
using Xdot<T>::DoDot;
// Constructor
- Xdotu(Queue &queue, Event &event, const std::string &name = "DOTU");
+ Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
// Templated-precision implementation of the routine
StatusCode DoDotu(const size_t n,
diff --git a/include/internal/routines/level1/xnrm2.h b/include/internal/routines/level1/xnrm2.h
index b3fffef6..6f6ca74f 100644
--- a/include/internal/routines/level1/xnrm2.h
+++ b/include/internal/routines/level1/xnrm2.h
@@ -28,6 +28,7 @@ class Xnrm2: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
@@ -36,7 +37,7 @@ class Xnrm2: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xnrm2(Queue &queue, Event &event, const std::string &name = "NRM2");
+ Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
// Templated-precision implementation of the routine
StatusCode DoNrm2(const size_t n,
diff --git a/include/internal/routines/level1/xscal.h b/include/internal/routines/level1/xscal.h
index d97b5a07..e10a201d 100644
--- a/include/internal/routines/level1/xscal.h
+++ b/include/internal/routines/level1/xscal.h
@@ -28,13 +28,14 @@ class Xscal: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
- Xscal(Queue &queue, Event &event, const std::string &name = "SCAL");
+ Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
// Templated-precision implementation of the routine
StatusCode DoScal(const size_t n, const T alpha,
diff --git a/include/internal/routines/level1/xswap.h b/include/internal/routines/level1/xswap.h
index fe79882b..0f240763 100644
--- a/include/internal/routines/level1/xswap.h
+++ b/include/internal/routines/level1/xswap.h
@@ -28,6 +28,7 @@ class Xswap: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@@ -35,7 +36,7 @@ class Xswap: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xswap(Queue &queue, Event &event, const std::string &name = "SWAP");
+ Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
// Templated-precision implementation of the routine
StatusCode DoSwap(const size_t n,
diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h
index 27b033e9..bc94c77d 100644
--- a/include/internal/routines/level2/xgbmv.h
+++ b/include/internal/routines/level2/xgbmv.h
@@ -30,7 +30,7 @@ class Xgbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
- Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV");
+ Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
// Templated-precision implementation of the routine
StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h
index b31565ec..0b2a8e66 100644
--- a/include/internal/routines/level2/xgemv.h
+++ b/include/internal/routines/level2/xgemv.h
@@ -28,6 +28,7 @@ class Xgemv: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@@ -37,7 +38,7 @@ class Xgemv: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xgemv(Queue &queue, Event &event, const std::string &name = "GEMV");
+ Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
// Templated-precision implementation of the routine
StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
diff --git a/include/internal/routines/level2/xger.h b/include/internal/routines/level2/xger.h
index 45ecea10..5ace9da6 100644
--- a/include/internal/routines/level2/xger.h
+++ b/include/internal/routines/level2/xger.h
@@ -28,6 +28,7 @@ class Xger: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@@ -36,7 +37,7 @@ class Xger: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xger(Queue &queue, Event &event, const std::string &name = "GER");
+ Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
// Templated-precision implementation of the routine
StatusCode DoGer(const Layout layout,
diff --git a/include/internal/routines/level2/xgerc.h b/include/internal/routines/level2/xgerc.h
index 8e515a14..6d06ef94 100644
--- a/include/internal/routines/level2/xgerc.h
+++ b/include/internal/routines/level2/xgerc.h
@@ -28,7 +28,7 @@ class Xgerc: public Xger<T> {
using Xger<T>::DoGer;
// Constructor
- Xgerc(Queue &queue, Event &event, const std::string &name = "GERC");
+ Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
// Templated-precision implementation of the routine
StatusCode DoGerc(const Layout layout,
diff --git a/include/internal/routines/level2/xgeru.h b/include/internal/routines/level2/xgeru.h
index ec485c37..45ce1cba 100644
--- a/include/internal/routines/level2/xgeru.h
+++ b/include/internal/routines/level2/xgeru.h
@@ -28,7 +28,7 @@ class Xgeru: public Xger<T> {
using Xger<T>::DoGer;
// Constructor
- Xgeru(Queue &queue, Event &event, const std::string &name = "GERU");
+ Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
// Templated-precision implementation of the routine
StatusCode DoGeru(const Layout layout,
diff --git a/include/internal/routines/level2/xhbmv.h b/include/internal/routines/level2/xhbmv.h
index 65138424..f0a6212c 100644
--- a/include/internal/routines/level2/xhbmv.h
+++ b/include/internal/routines/level2/xhbmv.h
@@ -30,7 +30,7 @@ class Xhbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
- Xhbmv(Queue &queue, Event &event, const std::string &name = "HBMV");
+ Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
// Templated-precision implementation of the routine
StatusCode DoHbmv(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h
index b74db760..3daf2457 100644
--- a/include/internal/routines/level2/xhemv.h
+++ b/include/internal/routines/level2/xhemv.h
@@ -30,7 +30,7 @@ class Xhemv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
- Xhemv(Queue &queue, Event &event, const std::string &name = "HEMV");
+ Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
// Templated-precision implementation of the routine
StatusCode DoHemv(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xher.h b/include/internal/routines/level2/xher.h
index 6322265b..861ba302 100644
--- a/include/internal/routines/level2/xher.h
+++ b/include/internal/routines/level2/xher.h
@@ -28,6 +28,7 @@ class Xher: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestMatrixA;
@@ -36,7 +37,7 @@ class Xher: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xher(Queue &queue, Event &event, const std::string &name = "HER");
+ Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
// Translates alpha of type 'U' into type 'T'
T GetAlpha(const U alpha);
diff --git a/include/internal/routines/level2/xher2.h b/include/internal/routines/level2/xher2.h
index 26f69046..9a23199e 100644
--- a/include/internal/routines/level2/xher2.h
+++ b/include/internal/routines/level2/xher2.h
@@ -28,6 +28,7 @@ class Xher2: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@@ -37,7 +38,7 @@ class Xher2: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xher2(Queue &queue, Event &event, const std::string &name = "HER2");
+ Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
// Templated-precision implementation of the routine
StatusCode DoHer2(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xhpmv.h b/include/internal/routines/level2/xhpmv.h
index 48f1ed3f..a1d5595a 100644
--- a/include/internal/routines/level2/xhpmv.h
+++ b/include/internal/routines/level2/xhpmv.h
@@ -30,7 +30,7 @@ class Xhpmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
- Xhpmv(Queue &queue, Event &event, const std::string &name = "HPMV");
+ Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
// Templated-precision implementation of the routine
StatusCode DoHpmv(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xhpr.h b/include/internal/routines/level2/xhpr.h
index a0c3cb92..6554d74c 100644
--- a/include/internal/routines/level2/xhpr.h
+++ b/include/internal/routines/level2/xhpr.h
@@ -28,7 +28,7 @@ class Xhpr: public Xher<T,U> {
using Xher<T,U>::DoHer;
// Constructor
- Xhpr(Queue &queue, Event &event, const std::string &name = "HPR");
+ Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
// Templated-precision implementation of the routine
StatusCode DoHpr(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xhpr2.h b/include/internal/routines/level2/xhpr2.h
index fd243d33..d95e7b61 100644
--- a/include/internal/routines/level2/xhpr2.h
+++ b/include/internal/routines/level2/xhpr2.h
@@ -28,7 +28,7 @@ class Xhpr2: public Xher2<T> {
using Xher2<T>::DoHer2;
// Constructor
- Xhpr2(Queue &queue, Event &event, const std::string &name = "HPR2");
+ Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
// Templated-precision implementation of the routine
StatusCode DoHpr2(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xsbmv.h b/include/internal/routines/level2/xsbmv.h
index bb24d8f4..4328e377 100644
--- a/include/internal/routines/level2/xsbmv.h
+++ b/include/internal/routines/level2/xsbmv.h
@@ -30,7 +30,7 @@ class Xsbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
- Xsbmv(Queue &queue, Event &event, const std::string &name = "SBMV");
+ Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
// Templated-precision implementation of the routine
StatusCode DoSbmv(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xspmv.h b/include/internal/routines/level2/xspmv.h
index 88f02a2f..ca3e28b6 100644
--- a/include/internal/routines/level2/xspmv.h
+++ b/include/internal/routines/level2/xspmv.h
@@ -30,7 +30,7 @@ class Xspmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
- Xspmv(Queue &queue, Event &event, const std::string &name = "SPMV");
+ Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
// Templated-precision implementation of the routine
StatusCode DoSpmv(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xspr.h b/include/internal/routines/level2/xspr.h
index 5b01d2cb..7e91abc5 100644
--- a/include/internal/routines/level2/xspr.h
+++ b/include/internal/routines/level2/xspr.h
@@ -28,7 +28,7 @@ class Xspr: public Xher<T,T> {
using Xher<T,T>::DoHer;
// Constructor
- Xspr(Queue &queue, Event &event, const std::string &name = "SPR");
+ Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
// Templated-precision implementation of the routine
StatusCode DoSpr(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xspr2.h b/include/internal/routines/level2/xspr2.h
index 3d5f4992..a34be8e8 100644
--- a/include/internal/routines/level2/xspr2.h
+++ b/include/internal/routines/level2/xspr2.h
@@ -28,7 +28,7 @@ class Xspr2: public Xher2<T> {
using Xher2<T>::DoHer2;
// Constructor
- Xspr2(Queue &queue, Event &event, const std::string &name = "SPR2");
+ Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
// Templated-precision implementation of the routine
StatusCode DoSpr2(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xsymv.h b/include/internal/routines/level2/xsymv.h
index c7b92702..98a0ce88 100644
--- a/include/internal/routines/level2/xsymv.h
+++ b/include/internal/routines/level2/xsymv.h
@@ -30,7 +30,7 @@ class Xsymv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
- Xsymv(Queue &queue, Event &event, const std::string &name = "SYMV");
+ Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
// Templated-precision implementation of the routine
StatusCode DoSymv(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xsyr.h b/include/internal/routines/level2/xsyr.h
index 9704a881..f88498ae 100644
--- a/include/internal/routines/level2/xsyr.h
+++ b/include/internal/routines/level2/xsyr.h
@@ -28,7 +28,7 @@ class Xsyr: public Xher<T,T> {
using Xher<T,T>::DoHer;
// Constructor
- Xsyr(Queue &queue, Event &event, const std::string &name = "SYR");
+ Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
// Templated-precision implementation of the routine
StatusCode DoSyr(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xsyr2.h b/include/internal/routines/level2/xsyr2.h
index f4dc9375..d2d3143a 100644
--- a/include/internal/routines/level2/xsyr2.h
+++ b/include/internal/routines/level2/xsyr2.h
@@ -28,7 +28,7 @@ class Xsyr2: public Xher2<T> {
using Xher2<T>::DoHer2;
// Constructor
- Xsyr2(Queue &queue, Event &event, const std::string &name = "SYR2");
+ Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
// Templated-precision implementation of the routine
StatusCode DoSyr2(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h
index 89c90193..3b358080 100644
--- a/include/internal/routines/level2/xtbmv.h
+++ b/include/internal/routines/level2/xtbmv.h
@@ -34,7 +34,7 @@ class Xtbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
- Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV");
+ Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
// Templated-precision implementation of the routine
StatusCode DoTbmv(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h
index 183d3505..f306cf4a 100644
--- a/include/internal/routines/level2/xtpmv.h
+++ b/include/internal/routines/level2/xtpmv.h
@@ -34,7 +34,7 @@ class Xtpmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
- Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV");
+ Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
// Templated-precision implementation of the routine
StatusCode DoTpmv(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h
index dadfbc98..cf0824a4 100644
--- a/include/internal/routines/level2/xtrmv.h
+++ b/include/internal/routines/level2/xtrmv.h
@@ -34,7 +34,7 @@ class Xtrmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
- Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV");
+ Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
// Templated-precision implementation of the routine
StatusCode DoTrmv(const Layout layout, const Triangle triangle,
diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h
index 9b40a7fc..85fb0616 100644
--- a/include/internal/routines/level3/xgemm.h
+++ b/include/internal/routines/level3/xgemm.h
@@ -28,6 +28,7 @@ class Xgemm: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@@ -38,7 +39,7 @@ class Xgemm: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xgemm(Queue &queue, Event &event, const std::string &name = "GEMM");
+ Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
// Templated-precision implementation of the routine
StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h
index ca38ca08..ec42b569 100644
--- a/include/internal/routines/level3/xhemm.h
+++ b/include/internal/routines/level3/xhemm.h
@@ -37,7 +37,7 @@ class Xhemm: public Xgemm<T> {
using Xgemm<T>::DoGemm;
// Constructor
- Xhemm(Queue &queue, Event &event, const std::string &name = "HEMM");
+ Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");
// Templated-precision implementation of the routine
StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h
index 7113a172..623afd49 100644
--- a/include/internal/routines/level3/xher2k.h
+++ b/include/internal/routines/level3/xher2k.h
@@ -30,6 +30,7 @@ class Xher2k: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@@ -40,7 +41,7 @@ class Xher2k: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xher2k(Queue &queue, Event &event, const std::string &name = "HER2K");
+ Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
// Templated-precision implementation of the routine
StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h
index 47112c2c..629695ff 100644
--- a/include/internal/routines/level3/xherk.h
+++ b/include/internal/routines/level3/xherk.h
@@ -30,6 +30,7 @@ class Xherk: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@@ -39,7 +40,7 @@ class Xherk: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xherk(Queue &queue, Event &event, const std::string &name = "HERK");
+ Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
// Templated-precision implementation of the routine
StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h
index 9fc80eb4..16ad6f53 100644
--- a/include/internal/routines/level3/xsymm.h
+++ b/include/internal/routines/level3/xsymm.h
@@ -39,7 +39,7 @@ class Xsymm: public Xgemm<T> {
using Xgemm<T>::DoGemm;
// Constructor
- Xsymm(Queue &queue, Event &event, const std::string &name = "SYMM");
+ Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");
// Templated-precision implementation of the routine
StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h
index c4679028..88669626 100644
--- a/include/internal/routines/level3/xsyr2k.h
+++ b/include/internal/routines/level3/xsyr2k.h
@@ -30,6 +30,7 @@ class Xsyr2k: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@@ -40,7 +41,7 @@ class Xsyr2k: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xsyr2k(Queue &queue, Event &event, const std::string &name = "SYR2K");
+ Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
// Templated-precision implementation of the routine
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h
index abf6b681..e95c7c1c 100644
--- a/include/internal/routines/level3/xsyrk.h
+++ b/include/internal/routines/level3/xsyrk.h
@@ -32,6 +32,7 @@ class Xsyrk: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
+ using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@@ -41,7 +42,7 @@ class Xsyrk: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
- Xsyrk(Queue &queue, Event &event, const std::string &name = "SYRK");
+ Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
// Templated-precision implementation of the routine
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h
index a1f4d15c..01f6594d 100644
--- a/include/internal/routines/level3/xtrmm.h
+++ b/include/internal/routines/level3/xtrmm.h
@@ -38,7 +38,7 @@ class Xtrmm: public Xgemm<T> {
using Xgemm<T>::DoGemm;
// Constructor
- Xtrmm(Queue &queue, Event &event, const std::string &name = "TRMM");
+ Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM");
// Templated-precision implementation of the routine
StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
diff --git a/samples/sgemm.cc b/samples/sgemm.cc
index 785b051c..78f2dee8 100644
--- a/samples/sgemm.cc
+++ b/samples/sgemm.cc
@@ -61,7 +61,7 @@ int main() {
// Creates the OpenCL context, queue, and an event
auto context = cl::Context({device});
auto queue = cl::CommandQueue(context, device);
- auto event = cl::Event();
+ auto event = cl_event{nullptr};
// Populate host matrices with some example data
auto host_a = std::vector<float>(m*k);
@@ -84,7 +84,6 @@ int main() {
// Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision.
auto queue_plain = queue();
- auto event_plain = event();
auto status = Gemm(clblast::Layout::kRowMajor,
clblast::Transpose::kNo, clblast::Transpose::kNo,
m, n, k,
@@ -93,10 +92,10 @@ int main() {
device_b(), 0, b_ld,
beta,
device_c(), 0, c_ld,
- &queue_plain, &event_plain);
+ &queue_plain, &event);
// Record the execution time
- event.wait();
+ clWaitForEvents(1, &event);
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index bdf6b9d7..8cd35f95 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -169,8 +169,7 @@ def clblast_cc(routines):
if routine.implemented:
result += routine.RoutineHeaderCPP(12, "")+" {\n"
result += " auto queue_cpp = Queue(*queue);\n"
- result += " auto event_cpp = Event(event);\n"
- result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event_cpp);\n"
+ result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event);\n"
result += " auto status = routine.SetUp();\n"
result += " if (status != StatusCode::kSuccess) { return status; }\n"
result += " return routine.Do"+routine.name.capitalize()+"("
diff --git a/src/clblast.cc b/src/clblast.cc
index 75893ee9..4f4b6078 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -161,8 +161,7 @@ StatusCode Swap(const size_t n,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xswap<T>(queue_cpp, event_cpp);
+ auto routine = Xswap<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSwap(n,
@@ -193,8 +192,7 @@ StatusCode Scal(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xscal<T>(queue_cpp, event_cpp);
+ auto routine = Xscal<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoScal(n,
@@ -225,8 +223,7 @@ StatusCode Copy(const size_t n,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xcopy<T>(queue_cpp, event_cpp);
+ auto routine = Xcopy<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoCopy(n,
@@ -258,8 +255,7 @@ StatusCode Axpy(const size_t n,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xaxpy<T>(queue_cpp, event_cpp);
+ auto routine = Xaxpy<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoAxpy(n,
@@ -296,8 +292,7 @@ StatusCode Dot(const size_t n,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xdot<T>(queue_cpp, event_cpp);
+ auto routine = Xdot<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoDot(n,
@@ -324,8 +319,7 @@ StatusCode Dotu(const size_t n,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xdotu<T>(queue_cpp, event_cpp);
+ auto routine = Xdotu<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoDotu(n,
@@ -352,8 +346,7 @@ StatusCode Dotc(const size_t n,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xdotc<T>(queue_cpp, event_cpp);
+ auto routine = Xdotc<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoDotc(n,
@@ -379,8 +372,7 @@ StatusCode Nrm2(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xnrm2<T>(queue_cpp, event_cpp);
+ auto routine = Xnrm2<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoNrm2(n,
@@ -419,8 +411,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xgemv<T>(queue_cpp, event_cpp);
+ auto routine = Xgemv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGemv(layout, a_transpose,
@@ -475,8 +466,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xgbmv<T>(queue_cpp, event_cpp);
+ auto routine = Xgbmv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGbmv(layout, a_transpose,
@@ -531,8 +521,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xhemv<T>(queue_cpp, event_cpp);
+ auto routine = Xhemv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHemv(layout, triangle,
@@ -571,8 +560,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xhbmv<T>(queue_cpp, event_cpp);
+ auto routine = Xhbmv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHbmv(layout, triangle,
@@ -611,8 +599,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xhpmv<T>(queue_cpp, event_cpp);
+ auto routine = Xhpmv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHpmv(layout, triangle,
@@ -651,8 +638,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xsymv<T>(queue_cpp, event_cpp);
+ auto routine = Xsymv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSymv(layout, triangle,
@@ -691,8 +677,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xsbmv<T>(queue_cpp, event_cpp);
+ auto routine = Xsbmv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSbmv(layout, triangle,
@@ -731,8 +716,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xspmv<T>(queue_cpp, event_cpp);
+ auto routine = Xspmv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSpmv(layout, triangle,
@@ -768,8 +752,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xtrmv<T>(queue_cpp, event_cpp);
+ auto routine = Xtrmv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoTrmv(layout, triangle, a_transpose, diagonal,
@@ -806,8 +789,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xtbmv<T>(queue_cpp, event_cpp);
+ auto routine = Xtbmv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoTbmv(layout, triangle, a_transpose, diagonal,
@@ -844,8 +826,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xtpmv<T>(queue_cpp, event_cpp);
+ auto routine = Xtpmv<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoTpmv(layout, triangle, a_transpose, diagonal,
@@ -974,8 +955,7 @@ StatusCode Ger(const Layout layout,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xger<T>(queue_cpp, event_cpp);
+ auto routine = Xger<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGer(layout,
@@ -1010,8 +990,7 @@ StatusCode Geru(const Layout layout,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xgeru<T>(queue_cpp, event_cpp);
+ auto routine = Xgeru<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGeru(layout,
@@ -1046,8 +1025,7 @@ StatusCode Gerc(const Layout layout,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xgerc<T>(queue_cpp, event_cpp);
+ auto routine = Xgerc<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGerc(layout,
@@ -1081,8 +1059,7 @@ StatusCode Her(const Layout layout, const Triangle triangle,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xher<std::complex<T>,T>(queue_cpp, event_cpp);
+ auto routine = Xher<std::complex<T>,T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHer(layout, triangle,
@@ -1113,8 +1090,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event_cpp);
+ auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHpr(layout, triangle,
@@ -1146,8 +1122,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xher2<T>(queue_cpp, event_cpp);
+ auto routine = Xher2<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHer2(layout, triangle,
@@ -1182,8 +1157,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xhpr2<T>(queue_cpp, event_cpp);
+ auto routine = Xhpr2<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHpr2(layout, triangle,
@@ -1217,8 +1191,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xsyr<T>(queue_cpp, event_cpp);
+ auto routine = Xsyr<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSyr(layout, triangle,
@@ -1249,8 +1222,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xspr<T>(queue_cpp, event_cpp);
+ auto routine = Xspr<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSpr(layout, triangle,
@@ -1282,8 +1254,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xsyr2<T>(queue_cpp, event_cpp);
+ auto routine = Xsyr2<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSyr2(layout, triangle,
@@ -1318,8 +1289,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xspr2<T>(queue_cpp, event_cpp);
+ auto routine = Xspr2<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSpr2(layout, triangle,
@@ -1359,8 +1329,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xgemm<T>(queue_cpp, event_cpp);
+ auto routine = Xgemm<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGemm(layout, a_transpose, b_transpose,
@@ -1415,8 +1384,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xsymm<T>(queue_cpp, event_cpp);
+ auto routine = Xsymm<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSymm(layout, side, triangle,
@@ -1471,8 +1439,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xhemm<T>(queue_cpp, event_cpp);
+ auto routine = Xhemm<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHemm(layout, side, triangle,
@@ -1510,8 +1477,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xsyrk<T>(queue_cpp, event_cpp);
+ auto routine = Xsyrk<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSyrk(layout, triangle, a_transpose,
@@ -1560,8 +1526,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xherk<std::complex<T>,T>(queue_cpp, event_cpp);
+ auto routine = Xherk<std::complex<T>,T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHerk(layout, triangle, a_transpose,
@@ -1597,8 +1562,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xsyr2k<T>(queue_cpp, event_cpp);
+ auto routine = Xsyr2k<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoSyr2k(layout, triangle, ab_transpose,
@@ -1653,8 +1617,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xher2k<T,U>(queue_cpp, event_cpp);
+ auto routine = Xher2k<T,U>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHer2k(layout, triangle, ab_transpose,
@@ -1691,8 +1654,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
- auto event_cpp = Event(event);
- auto routine = Xtrmm<T>(queue_cpp, event_cpp);
+ auto routine = Xtrmm<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal,
diff --git a/src/routine.cc b/src/routine.cc
index ff7b3e1a..b5ba63eb 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -26,7 +26,7 @@ template <typename T> std::mutex Routine<T>::program_cache_mutex_;
// Constructor: not much here, because no status codes can be returned
template <typename T>
-Routine<T>::Routine(Queue &queue, Event &event, const std::string &name,
+Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision):
precision_(precision),
routine_name_(name),
@@ -117,7 +117,8 @@ StatusCode Routine<T>::SetUp() {
// Enqueues a kernel, waits for completion, and checks for errors
template <typename T>
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
- const std::vector<size_t> &local) {
+ const std::vector<size_t> &local, EventPointer event,
+ std::vector<Event>& waitForEvents) {
// Tests for validity of the local thread sizes
if (local.size() > max_work_item_dimensions_) {
@@ -141,18 +142,21 @@ StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
// Launches the kernel (and checks for launch errors)
try {
- kernel.Launch(queue_, global, local, event_);
+ kernel.Launch(queue_, global, local, event, waitForEvents);
} catch (...) { return StatusCode::kKernelLaunchError; }
- // Waits for completion of the kernel
- try {
- queue_.Finish(event_);
- } catch (...) { return StatusCode::kKernelRunError; }
-
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
+// As above, but without an event waiting list
+template <typename T>
+StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
+ const std::vector<size_t> &local, EventPointer event) {
+ auto emptyWaitingList = std::vector<Event>();
+ return RunKernel(kernel, global, local, event, emptyWaitingList);
+}
+
// =================================================================================================
// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
@@ -258,7 +262,8 @@ StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, co
// Copies or transposes a matrix and pads/unpads it with zeros
template <typename T>
-StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
+StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
+ const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
@@ -340,13 +345,13 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"],
dest_two / db_["TRA_WPT"]};
auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event, waitForEvents);
}
else {
auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event, waitForEvents);
}
}
else {
@@ -354,13 +359,13 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
auto global = std::vector<size_t>{dest_one / db_["COPY_VW"],
dest_two / db_["COPY_WPT"]};
auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event, waitForEvents);
}
else {
auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event, waitForEvents);
}
}
return status;
diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc
index c5acaf49..37d23543 100644
--- a/src/routines/level1/xaxpy.cc
+++ b/src/routines/level1/xaxpy.cc
@@ -29,7 +29,7 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xaxpy<T>::Xaxpy(Queue &queue, Event &event, const std::string &name):
+Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@@ -89,13 +89,13 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc
index 8c7f8671..04508383 100644
--- a/src/routines/level1/xcopy.cc
+++ b/src/routines/level1/xcopy.cc
@@ -29,7 +29,7 @@ template <> const Precision Xcopy<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xcopy<T>::Xcopy(Queue &queue, Event &event, const std::string &name):
+Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@@ -87,13 +87,13 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc
index e22b0f8b..4813a004 100644
--- a/src/routines/level1/xdot.cc
+++ b/src/routines/level1/xdot.cc
@@ -29,7 +29,7 @@ template <> const Precision Xdot<double2>::precision_ = Precision::kComplexDoubl
// Constructor: forwards to base class constructor
template <typename T>
-Xdot<T>::Xdot(Queue &queue, Event &event, const std::string &name):
+Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
source_string_ =
#include "../../kernels/level1/xdot.opencl"
@@ -78,11 +78,16 @@ StatusCode Xdot<T>::DoDot(const size_t n,
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
+ // Event waiting list
+ auto eventWaitList = std::vector<Event>();
+
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
- status = RunKernel(kernel1, global1, local1);
+ auto kernelEvent = Event();
+ status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
@@ -92,7 +97,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, global2, local2);
+ status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level1/xdotc.cc b/src/routines/level1/xdotc.cc
index f414f556..b3a01079 100644
--- a/src/routines/level1/xdotc.cc
+++ b/src/routines/level1/xdotc.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xdotc<T>::Xdotc(Queue &queue, Event &event, const std::string &name):
+Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
Xdot<T>(queue, event, name) {
}
diff --git a/src/routines/level1/xdotu.cc b/src/routines/level1/xdotu.cc
index 28d9b730..8dded6e0 100644
--- a/src/routines/level1/xdotu.cc
+++ b/src/routines/level1/xdotu.cc
@@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xdotu<T>::Xdotu(Queue &queue, Event &event, const std::string &name):
+Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
Xdot<T>(queue, event, name) {
}
diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc
index 685eb29f..04e4137c 100644
--- a/src/routines/level1/xnrm2.cc
+++ b/src/routines/level1/xnrm2.cc
@@ -29,7 +29,7 @@ template <> const Precision Xnrm2<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xnrm2<T>::Xnrm2(Queue &queue, Event &event, const std::string &name):
+Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
source_string_ =
#include "../../kernels/level1/xnrm2.opencl"
@@ -69,12 +69,16 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
+ // Event waiting list
+ auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
- status = RunKernel(kernel1, global1, local1);
+ auto kernelEvent = Event();
+ status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
@@ -84,7 +88,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
- status = RunKernel(kernel2, global2, local2);
+ status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc
index 57bbe9e8..e83e73fd 100644
--- a/src/routines/level1/xscal.cc
+++ b/src/routines/level1/xscal.cc
@@ -29,7 +29,7 @@ template <> const Precision Xscal<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xscal<T>::Xscal(Queue &queue, Event &event, const std::string &name):
+Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@@ -81,13 +81,13 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc
index c986b3fb..bc425f40 100644
--- a/src/routines/level1/xswap.cc
+++ b/src/routines/level1/xswap.cc
@@ -29,7 +29,7 @@ template <> const Precision Xswap<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xswap<T>::Xswap(Queue &queue, Event &event, const std::string &name):
+Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@@ -87,13 +87,13 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc
index 14d391ca..f90e26b2 100644
--- a/src/routines/level2/xgbmv.cc
+++ b/src/routines/level2/xgbmv.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name):
+Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index bf7ae6fa..24e87db0 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -29,7 +29,7 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
+Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
source_string_ =
#include "../../kernels/level2/xgemv.opencl"
@@ -162,7 +162,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Launches the kernel
auto global = std::vector<size_t>{global_size};
auto local = std::vector<size_t>{local_size};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc
index 9ab21bfb..dda78232 100644
--- a/src/routines/level2/xger.cc
+++ b/src/routines/level2/xger.cc
@@ -29,7 +29,7 @@ template <> const Precision Xger<double2>::precision_ = Precision::kComplexDoubl
// Constructor: forwards to base class constructor
template <typename T>
-Xger<T>::Xger(Queue &queue, Event &event, const std::string &name):
+Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
@@ -89,7 +89,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level2/xgerc.cc b/src/routines/level2/xgerc.cc
index 09408898..73284b52 100644
--- a/src/routines/level2/xgerc.cc
+++ b/src/routines/level2/xgerc.cc
@@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xgerc<T>::Xgerc(Queue &queue, Event &event, const std::string &name):
+Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):
Xger<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xgeru.cc b/src/routines/level2/xgeru.cc
index 36fd9d0a..7730d6a5 100644
--- a/src/routines/level2/xgeru.cc
+++ b/src/routines/level2/xgeru.cc
@@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xgeru<T>::Xgeru(Queue &queue, Event &event, const std::string &name):
+Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):
Xger<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc
index f59a7cb3..58591b50 100644
--- a/src/routines/level2/xhbmv.cc
+++ b/src/routines/level2/xhbmv.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name):
+Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc
index 5a58b28b..b4ef0fa4 100644
--- a/src/routines/level2/xhemv.cc
+++ b/src/routines/level2/xhemv.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xhemv<T>::Xhemv(Queue &queue, Event &event, const std::string &name):
+Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc
index 1aefa240..aba665b0 100644
--- a/src/routines/level2/xher.cc
+++ b/src/routines/level2/xher.cc
@@ -28,7 +28,7 @@ template <> const Precision Xher<double2, double>::precision_ = Precision::kComp
// Constructor: forwards to base class constructor
template <typename T, typename U>
-Xher<T,U>::Xher(Queue &queue, Event &event, const std::string &name):
+Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
@@ -99,7 +99,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc
index 364add12..bcd6488f 100644
--- a/src/routines/level2/xher2.cc
+++ b/src/routines/level2/xher2.cc
@@ -28,7 +28,7 @@ template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xher2<T>::Xher2(Queue &queue, Event &event, const std::string &name):
+Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
@@ -91,7 +91,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
- status = RunKernel(kernel, global, local);
+ status = RunKernel(kernel, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc
index 2269255d..92686dbe 100644
--- a/src/routines/level2/xhpmv.cc
+++ b/src/routines/level2/xhpmv.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xhpmv<T>::Xhpmv(Queue &queue, Event &event, const std::string &name):
+Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xhpr.cc b/src/routines/level2/xhpr.cc
index b0cea72f..4b31ad09 100644
--- a/src/routines/level2/xhpr.cc
+++ b/src/routines/level2/xhpr.cc
@@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T, typename U>
-Xhpr<T,U>::Xhpr(Queue &queue, Event &event, const std::string &name):
+Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name):
Xher<T,U>(queue, event, name) {
}
diff --git a/src/routines/level2/xhpr2.cc b/src/routines/level2/xhpr2.cc
index ded35e53..9be24f43 100644
--- a/src/routines/level2/xhpr2.cc
+++ b/src/routines/level2/xhpr2.cc
@@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xhpr2<T>::Xhpr2(Queue &queue, Event &event, const std::string &name):
+Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const std::string &name):
Xher2<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc
index 457bd762..bc82c88d 100644
--- a/src/routines/level2/xsbmv.cc
+++ b/src/routines/level2/xsbmv.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xsbmv<T>::Xsbmv(Queue &queue, Event &event, const std::string &name):
+Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc
index 4f1a9c61..6e00dcfa 100644
--- a/src/routines/level2/xspmv.cc
+++ b/src/routines/level2/xspmv.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xspmv<T>::Xspmv(Queue &queue, Event &event, const std::string &name):
+Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cc
index 2d998e0b..55af2f29 100644
--- a/src/routines/level2/xspr.cc
+++ b/src/routines/level2/xspr.cc
@@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xspr<T>::Xspr(Queue &queue, Event &event, const std::string &name):
+Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name):
Xher<T,T>(queue, event, name) {
}
diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cc
index fd5232da..9a3f97ce 100644
--- a/src/routines/level2/xspr2.cc
+++ b/src/routines/level2/xspr2.cc
@@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xspr2<T>::Xspr2(Queue &queue, Event &event, const std::string &name):
+Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name):
Xher2<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc
index ec12324b..a9eb284f 100644
--- a/src/routines/level2/xsymv.cc
+++ b/src/routines/level2/xsymv.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xsymv<T>::Xsymv(Queue &queue, Event &event, const std::string &name):
+Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cc
index c01fa2d3..4b3928e5 100644
--- a/src/routines/level2/xsyr.cc
+++ b/src/routines/level2/xsyr.cc
@@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xsyr<T>::Xsyr(Queue &queue, Event &event, const std::string &name):
+Xsyr<T>::Xsyr(Queue &queue, EventPointer event, const std::string &name):
Xher<T,T>(queue, event, name) {
}
diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cc
index 6db55085..3ae389e0 100644
--- a/src/routines/level2/xsyr2.cc
+++ b/src/routines/level2/xsyr2.cc
@@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xsyr2<T>::Xsyr2(Queue &queue, Event &event, const std::string &name):
+Xsyr2<T>::Xsyr2(Queue &queue, EventPointer event, const std::string &name):
Xher2<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc
index 2e1aebff..47371c87 100644
--- a/src/routines/level2/xtbmv.cc
+++ b/src/routines/level2/xtbmv.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xtbmv<T>::Xtbmv(Queue &queue, Event &event, const std::string &name):
+Xtbmv<T>::Xtbmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc
index aa0e099b..c63cb9b2 100644
--- a/src/routines/level2/xtpmv.cc
+++ b/src/routines/level2/xtpmv.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xtpmv<T>::Xtpmv(Queue &queue, Event &event, const std::string &name):
+Xtpmv<T>::Xtpmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc
index 94424743..9111d41d 100644
--- a/src/routines/level2/xtrmv.cc
+++ b/src/routines/level2/xtrmv.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xtrmv<T>::Xtrmv(Queue &queue, Event &event, const std::string &name):
+Xtrmv<T>::Xtrmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 5dc2ad7f..7557dcc3 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -29,7 +29,7 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xgemm<T>::Xgemm(Queue &queue, Event &event, const std::string &name):
+Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
@@ -122,30 +122,43 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
- status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+ auto eventProcessA = Event();
+ status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
program, true, a_do_transpose, a_conjugate);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessA);
}
// As above, but now for matrix B
if (!b_no_temp) {
- status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
+ auto eventProcessB = Event();
+ status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+ b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
program, true, b_do_transpose, b_conjugate);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessB);
}
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
if (!c_no_temp && beta != static_cast<T>(0)) {
- status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
+ auto eventProcessC = Event();
+ status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
program, true, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessC);
}
// Retrieves the Xgemm kernel from the compiled binary
@@ -170,12 +183,15 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
- status = RunKernel(kernel, global, local);
+ auto eventKernel = Event();
+ status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventKernel);
// Runs the post-processing kernel if needed
if (!c_no_temp) {
- status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
+ status = PadCopyTransposeMatrix(event_, eventWaitList,
+ m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
program, false, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc
index bcc60dee..c0a4306a 100644
--- a/src/routines/level3/xhemm.cc
+++ b/src/routines/level3/xhemm.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xhemm<T>::Xhemm(Queue &queue, Event &event, const std::string &name):
+Xhemm<T>::Xhemm(Queue &queue, EventPointer event, const std::string &name):
Xgemm<T>(queue, event, name) {
}
@@ -79,9 +79,13 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- status = RunKernel(kernel, global, local);
+ auto kernelEvent = Event();
+ status = RunKernel(kernel, global, local, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
+ // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+ kernelEvent.WaitForCompletion();
+
// Runs the regular Xgemm code with either "C := AB+C" or ...
if (side == Side::kLeft) {
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index 1711905d..4d5a4d35 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -27,7 +27,7 @@ template <> const Precision Xher2k<double2,double>::precision_ = Precision::kCom
// Constructor: forwards to base class constructor
template <typename T, typename U>
-Xher2k<T,U>::Xher2k(Queue &queue, Event &event, const std::string &name):
+Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
@@ -112,39 +112,58 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a1_no_temp) {
- status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+ auto eventProcessA1 = Event();
+ status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList,
+ ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
program, true, ab_rotated, ab_conjugate);
+ eventWaitList.push_back(eventProcessA1);
if (ErrorIn(status)) { return status; }
}
if (!a2_no_temp) {
- status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+ auto eventProcessA2 = Event();
+ status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList,
+ ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
program, true, ab_rotated, !ab_conjugate);
+ eventWaitList.push_back(eventProcessA2);
if (ErrorIn(status)) { return status; }
}
if (!b1_no_temp) {
- status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+ auto eventProcessB1 = Event();
+ status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList,
+ ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
program, true, ab_rotated, ab_conjugate);
+ eventWaitList.push_back(eventProcessB1);
if (ErrorIn(status)) { return status; }
}
if (!b2_no_temp) {
- status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+ auto eventProcessB2 = Event();
+ status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList,
+ ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
program, true, ab_rotated, !ab_conjugate);
+ eventWaitList.push_back(eventProcessB2);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
- status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+ auto eventProcessC = Event();
+ status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
+ eventWaitList.push_back(eventProcessC);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
@@ -169,8 +188,10 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
- status = RunKernel(kernel, global, local);
+ auto eventKernel1 = Event();
+ status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventKernel1);
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
@@ -181,13 +202,16 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
kernel.SetArgument(5, a2_temp());
// Runs the kernel again
- status = RunKernel(kernel, global, local);
+ auto eventKernel2 = Event();
+ status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventKernel2);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ status = PadCopyTransposeMatrix(event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index cbd0a188..574debe4 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -27,7 +27,7 @@ template <> const Precision Xherk<double2,double>::precision_ = Precision::kComp
// Constructor: forwards to base class constructor
template <typename T, typename U>
-Xherk<T,U>::Xherk(Queue &queue, Event &event, const std::string &name):
+Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
@@ -103,27 +103,40 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped. Two copies are created.
if (!a_no_temp) {
- status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+ auto eventProcessA = Event();
+ status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
program, true, a_rotated, a_conjugate);
+ eventWaitList.push_back(eventProcessA);
if (ErrorIn(status)) { return status; }
}
if (!b_no_temp) {
- status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+ auto eventProcessB = Event();
+ status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
program, true, a_rotated, b_conjugate);
+ eventWaitList.push_back(eventProcessB);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
- status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+ auto eventProcessC = Event();
+ status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
+ eventWaitList.push_back(eventProcessC);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
@@ -149,13 +162,16 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
- status = RunKernel(kernel, global, local);
+ auto eventKernel = Event();
+ status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventKernel);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ status = PadCopyTransposeMatrix(event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc
index 583d5c7d..914a326a 100644
--- a/src/routines/level3/xsymm.cc
+++ b/src/routines/level3/xsymm.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xsymm<T>::Xsymm(Queue &queue, Event &event, const std::string &name):
+Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name):
Xgemm<T>(queue, event, name) {
}
@@ -79,9 +79,13 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- status = RunKernel(kernel, global, local);
+ auto kernelEvent = Event();
+ status = RunKernel(kernel, global, local, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
+ // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+ kernelEvent.WaitForCompletion();
+
// Runs the regular Xgemm code with either "C := AB+C" or ...
if (side == Side::kLeft) {
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index 79090871..44d0024e 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -29,7 +29,7 @@ template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDou
// Constructor: forwards to base class constructor
template <typename T>
-Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event, const std::string &name):
+Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
@@ -104,28 +104,41 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
- status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+ auto eventProcessA = Event();
+ status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+ ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
program, true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessA);
}
if (!b_no_temp) {
- status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+ auto eventProcessB = Event();
+ status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+ ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
program, true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessB);
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
- status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+ auto eventProcessC = Event();
+ status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
@@ -148,8 +161,10 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
- status = RunKernel(kernel, global, local);
+ auto eventKernel1 = Event();
+ status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventKernel1);
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
auto one = static_cast<T>(1);
@@ -158,13 +173,16 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
kernel.SetArgument(5, a_temp());
// Runs the kernel again
- status = RunKernel(kernel, global, local);
+ auto eventKernel2 = Event();
+ status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventKernel2);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ status = PadCopyTransposeMatrix(event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index ca429bd7..44ed8d35 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -29,7 +29,7 @@ template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xsyrk<T>::Xsyrk(Queue &queue, Event &event, const std::string &name):
+Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
@@ -97,22 +97,32 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+ // Events of all kernels (including pre/post processing kernels)
+ auto eventWaitList = std::vector<Event>();
+ auto emptyEventList = std::vector<Event>();
+
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
- status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+ auto eventProcessA = Event();
+ status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
program, true, a_rotated, false);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessA);
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
- status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+ auto eventProcessC = Event();
+ status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+ n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
@@ -135,17 +145,21 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
- status = RunKernel(kernel, global, local);
+ auto eventKernel = Event();
+ status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
+ eventWaitList.push_back(eventKernel);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
- status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+ status = PadCopyTransposeMatrix(event_, eventWaitList,
+ n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
+
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc
index 1180c026..484cf040 100644
--- a/src/routines/level3/xtrmm.cc
+++ b/src/routines/level3/xtrmm.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xtrmm<T>::Xtrmm(Queue &queue, Event &event, const std::string &name):
+Xtrmm<T>::Xtrmm(Queue &queue, EventPointer event, const std::string &name):
Xgemm<T>(queue, event, name) {
}
@@ -82,9 +82,13 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
- status = RunKernel(kernel, global, local);
+ auto kernelEvent = Event();
+ status = RunKernel(kernel, global, local, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
+ // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+ kernelEvent.WaitForCompletion();
+
// Runs the regular Xgemm code with either "B := alpha*A*B" or ...
if (side == Side::kLeft) {
status = DoGemm(layout, a_transpose, Transpose::kNo,