diff options
83 files changed, 360 insertions, 235 deletions
@@ -255,5 +255,4 @@ To-do list before release of version 1.0 ------------- - Support all routines supported by clBLAS -- Allow the user control over events and synchronization - Add half-precision routines (e.g. HGEMM) diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index 00905ef7..543d423a 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -73,29 +73,41 @@ class Event { public: // Constructor based on the regular OpenCL data-type - explicit Event(cl_event* event): event_(event) { } + explicit Event(const cl_event event): event_(event) { } + + // Regular constructor + explicit Event(): event_(nullptr) { } + + // Waits for completion of this event + void WaitForCompletion() const { + CheckError(clWaitForEvents(1, &event_)); + } // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation: // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx float GetElapsedTime() const { - CheckError(clWaitForEvents(1, event_)); + WaitForCompletion(); auto bytes = size_t{0}; - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); auto time_start = size_t{0}; - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); auto time_end = size_t{0}; - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); return (time_end - time_start) * 1.0e-6f; } // Accessor to the private data-member - cl_event& operator()() { return *event_; } + cl_event& operator()() { return event_; } + cl_event* pointer() { return &event_; } private: - cl_event* event_; + cl_event event_; }; +// Pointer to an OpenCL event +using EventPointer = cl_event*; + // ================================================================================================= // C++11 version of 'cl_platform_id' @@ -600,17 +612,36 @@ class Kernel { // Launches a kernel onto the specified queue void Launch(const Queue &queue, const std::vector<size_t> &global, - const std::vector<size_t> &local, Event &event) { + const std::vector<size_t> &local, EventPointer event) { + CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()), + nullptr, global.data(), local.data(), + 0, nullptr, event)); + } + + // As above, but with an event waiting list + void Launch(const Queue &queue, const std::vector<size_t> &global, + const std::vector<size_t> &local, EventPointer event, + std::vector<Event>& waitForEvents) { + if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } + + // Builds a plain version of the events waiting list + auto waitForEventsPlain = std::vector<cl_event>(); + for (auto &waitEvent : waitForEvents) { + waitForEventsPlain.push_back(waitEvent()); + } + + // Launches the kernel while waiting for other events CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()), nullptr, global.data(), local.data(), - 0, nullptr, &(event()))); + waitForEventsPlain.size(), waitForEventsPlain.data(), + event)); } // As above, but with the default local workgroup size - void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) { + void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) { CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()), nullptr, global.data(), nullptr, - 0, nullptr, &(event()))); + 0, nullptr, event)); } // Accessor to the private data-member diff --git a/include/internal/routine.h b/include/internal/routine.h index 5f5b8211..b2b6f622 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -55,7 +55,7 @@ class Routine { static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); } // Base class constructor - explicit Routine(Queue &queue, Event &event, const std::string &name, + explicit Routine(Queue &queue, EventPointer event, const std::string &name, const std::vector<std::string> &routines, const Precision precision); // Set-up phase of the kernel @@ -65,7 +65,12 @@ class Routine { // Runs a kernel given the global and local thread sizes StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global, - const std::vector<size_t> &local); + const std::vector<size_t> &local, EventPointer event, + std::vector<Event>& waitForEvents); + + // As above, but without an event waiting list + StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global, + const std::vector<size_t> &local, EventPointer event); // Tests for valid inputs of matrices A, B, and C StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer, @@ -87,7 +92,8 @@ class Routine { // Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write // to symmetric and triangular matrices through optional arguments. - StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, + StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents, + const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer<T> &src, const size_t dest_one, const size_t dest_two, @@ -114,7 +120,7 @@ class Routine { // The OpenCL objects, accessible only from derived classes Queue queue_; - Event event_; + EventPointer event_; const Context context_; const Device device_; diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h index 689cf169..bc00c8e3 100644 --- a/include/internal/routines/level1/xaxpy.h +++ b/include/internal/routines/level1/xaxpy.h @@ -28,6 +28,7 @@ class Xaxpy: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::TestVectorY; @@ -35,7 +36,7 @@ class Xaxpy: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xaxpy(Queue &queue, Event &event, const std::string &name = "AXPY"); + Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY"); // Templated-precision implementation of the routine StatusCode DoAxpy(const size_t n, const T alpha, diff --git a/include/internal/routines/level1/xcopy.h b/include/internal/routines/level1/xcopy.h index 15f339aa..5786cb0f 100644 --- a/include/internal/routines/level1/xcopy.h +++ b/include/internal/routines/level1/xcopy.h @@ -28,6 +28,7 @@ class Xcopy: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::TestVectorY; @@ -35,7 +36,7 @@ class Xcopy: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xcopy(Queue &queue, Event &event, const std::string &name = "COPY"); + Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY"); // Templated-precision implementation of the routine StatusCode DoCopy(const size_t n, diff --git a/include/internal/routines/level1/xdot.h b/include/internal/routines/level1/xdot.h index 64b62945..95a7ad07 100644 --- a/include/internal/routines/level1/xdot.h +++ b/include/internal/routines/level1/xdot.h @@ -28,6 +28,7 @@ class Xdot: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; @@ -37,7 +38,7 @@ class Xdot: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xdot(Queue &queue, Event &event, const std::string &name = "DOT"); + Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT"); // Templated-precision implementation of the routine StatusCode DoDot(const size_t n, diff --git a/include/internal/routines/level1/xdotc.h b/include/internal/routines/level1/xdotc.h index 726cec7c..0dc2cfe9 100644 --- a/include/internal/routines/level1/xdotc.h +++ b/include/internal/routines/level1/xdotc.h @@ -28,7 +28,7 @@ class Xdotc: public Xdot<T> { using Xdot<T>::DoDot; // Constructor - Xdotc(Queue &queue, Event &event, const std::string &name = "DOTC"); + Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC"); // Templated-precision implementation of the routine StatusCode DoDotc(const size_t n, diff --git a/include/internal/routines/level1/xdotu.h b/include/internal/routines/level1/xdotu.h index 825ebb78..98988744 100644 --- a/include/internal/routines/level1/xdotu.h +++ b/include/internal/routines/level1/xdotu.h @@ -28,7 +28,7 @@ class Xdotu: public Xdot<T> { using Xdot<T>::DoDot; // Constructor - Xdotu(Queue &queue, Event &event, const std::string &name = "DOTU"); + Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU"); // Templated-precision implementation of the routine StatusCode DoDotu(const size_t n, diff --git a/include/internal/routines/level1/xnrm2.h b/include/internal/routines/level1/xnrm2.h index b3fffef6..6f6ca74f 100644 --- a/include/internal/routines/level1/xnrm2.h +++ b/include/internal/routines/level1/xnrm2.h @@ -28,6 +28,7 @@ class Xnrm2: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; @@ -36,7 +37,7 @@ class Xnrm2: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xnrm2(Queue &queue, Event &event, const std::string &name = "NRM2"); + Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2"); // Templated-precision implementation of the routine StatusCode DoNrm2(const size_t n, diff --git a/include/internal/routines/level1/xscal.h b/include/internal/routines/level1/xscal.h index d97b5a07..e10a201d 100644 --- a/include/internal/routines/level1/xscal.h +++ b/include/internal/routines/level1/xscal.h @@ -28,13 +28,14 @@ class Xscal: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::RunKernel; using Routine<T>::ErrorIn; // Constructor - Xscal(Queue &queue, Event &event, const std::string &name = "SCAL"); + Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL"); // Templated-precision implementation of the routine StatusCode DoScal(const size_t n, const T alpha, diff --git a/include/internal/routines/level1/xswap.h b/include/internal/routines/level1/xswap.h index fe79882b..0f240763 100644 --- a/include/internal/routines/level1/xswap.h +++ b/include/internal/routines/level1/xswap.h @@ -28,6 +28,7 @@ class Xswap: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::TestVectorY; @@ -35,7 +36,7 @@ class Xswap: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xswap(Queue &queue, Event &event, const std::string &name = "SWAP"); + Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP"); // Templated-precision implementation of the routine StatusCode DoSwap(const size_t n, diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h index 27b033e9..bc94c77d 100644 --- a/include/internal/routines/level2/xgbmv.h +++ b/include/internal/routines/level2/xgbmv.h @@ -30,7 +30,7 @@ class Xgbmv: public Xgemv<T> { using Xgemv<T>::MatVec; // Constructor - Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV"); + Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV"); // Templated-precision implementation of the routine StatusCode DoGbmv(const Layout layout, const Transpose a_transpose, diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h index b31565ec..0b2a8e66 100644 --- a/include/internal/routines/level2/xgemv.h +++ b/include/internal/routines/level2/xgemv.h @@ -28,6 +28,7 @@ class Xgemv: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::TestVectorY; @@ -37,7 +38,7 @@ class Xgemv: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xgemv(Queue &queue, Event &event, const std::string &name = "GEMV"); + Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV"); // Templated-precision implementation of the routine StatusCode DoGemv(const Layout layout, const Transpose a_transpose, diff --git a/include/internal/routines/level2/xger.h b/include/internal/routines/level2/xger.h index 45ecea10..5ace9da6 100644 --- a/include/internal/routines/level2/xger.h +++ b/include/internal/routines/level2/xger.h @@ -28,6 +28,7 @@ class Xger: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::TestVectorY; @@ -36,7 +37,7 @@ class Xger: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xger(Queue &queue, Event &event, const std::string &name = "GER"); + Xger(Queue &queue, EventPointer event, const std::string &name = "GER"); // Templated-precision implementation of the routine StatusCode DoGer(const Layout layout, diff --git a/include/internal/routines/level2/xgerc.h b/include/internal/routines/level2/xgerc.h index 8e515a14..6d06ef94 100644 --- a/include/internal/routines/level2/xgerc.h +++ b/include/internal/routines/level2/xgerc.h @@ -28,7 +28,7 @@ class Xgerc: public Xger<T> { using Xger<T>::DoGer; // Constructor - Xgerc(Queue &queue, Event &event, const std::string &name = "GERC"); + Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC"); // Templated-precision implementation of the routine StatusCode DoGerc(const Layout layout, diff --git a/include/internal/routines/level2/xgeru.h b/include/internal/routines/level2/xgeru.h index ec485c37..45ce1cba 100644 --- a/include/internal/routines/level2/xgeru.h +++ b/include/internal/routines/level2/xgeru.h @@ -28,7 +28,7 @@ class Xgeru: public Xger<T> { using Xger<T>::DoGer; // Constructor - Xgeru(Queue &queue, Event &event, const std::string &name = "GERU"); + Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU"); // Templated-precision implementation of the routine StatusCode DoGeru(const Layout layout, diff --git a/include/internal/routines/level2/xhbmv.h b/include/internal/routines/level2/xhbmv.h index 65138424..f0a6212c 100644 --- a/include/internal/routines/level2/xhbmv.h +++ b/include/internal/routines/level2/xhbmv.h @@ -30,7 +30,7 @@ class Xhbmv: public Xgemv<T> { using Xgemv<T>::MatVec; // Constructor - Xhbmv(Queue &queue, Event &event, const std::string &name = "HBMV"); + Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV"); // Templated-precision implementation of the routine StatusCode DoHbmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h index b74db760..3daf2457 100644 --- a/include/internal/routines/level2/xhemv.h +++ b/include/internal/routines/level2/xhemv.h @@ -30,7 +30,7 @@ class Xhemv: public Xgemv<T> { using Xgemv<T>::MatVec; // Constructor - Xhemv(Queue &queue, Event &event, const std::string &name = "HEMV"); + Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV"); // Templated-precision implementation of the routine StatusCode DoHemv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xher.h b/include/internal/routines/level2/xher.h index 6322265b..861ba302 100644 --- a/include/internal/routines/level2/xher.h +++ b/include/internal/routines/level2/xher.h @@ -28,6 +28,7 @@ class Xher: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::TestMatrixA; @@ -36,7 +37,7 @@ class Xher: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xher(Queue &queue, Event &event, const std::string &name = "HER"); + Xher(Queue &queue, EventPointer event, const std::string &name = "HER"); // Translates alpha of type 'U' into type 'T' T GetAlpha(const U alpha); diff --git a/include/internal/routines/level2/xher2.h b/include/internal/routines/level2/xher2.h index 26f69046..9a23199e 100644 --- a/include/internal/routines/level2/xher2.h +++ b/include/internal/routines/level2/xher2.h @@ -28,6 +28,7 @@ class Xher2: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::TestVectorY; @@ -37,7 +38,7 @@ class Xher2: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xher2(Queue &queue, Event &event, const std::string &name = "HER2"); + Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2"); // Templated-precision implementation of the routine StatusCode DoHer2(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xhpmv.h b/include/internal/routines/level2/xhpmv.h index 48f1ed3f..a1d5595a 100644 --- a/include/internal/routines/level2/xhpmv.h +++ b/include/internal/routines/level2/xhpmv.h @@ -30,7 +30,7 @@ class Xhpmv: public Xgemv<T> { using Xgemv<T>::MatVec; // Constructor - Xhpmv(Queue &queue, Event &event, const std::string &name = "HPMV"); + Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV"); // Templated-precision implementation of the routine StatusCode DoHpmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xhpr.h b/include/internal/routines/level2/xhpr.h index a0c3cb92..6554d74c 100644 --- a/include/internal/routines/level2/xhpr.h +++ b/include/internal/routines/level2/xhpr.h @@ -28,7 +28,7 @@ class Xhpr: public Xher<T,U> { using Xher<T,U>::DoHer; // Constructor - Xhpr(Queue &queue, Event &event, const std::string &name = "HPR"); + Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR"); // Templated-precision implementation of the routine StatusCode DoHpr(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xhpr2.h b/include/internal/routines/level2/xhpr2.h index fd243d33..d95e7b61 100644 --- a/include/internal/routines/level2/xhpr2.h +++ b/include/internal/routines/level2/xhpr2.h @@ -28,7 +28,7 @@ class Xhpr2: public Xher2<T> { using Xher2<T>::DoHer2; // Constructor - Xhpr2(Queue &queue, Event &event, const std::string &name = "HPR2"); + Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2"); // Templated-precision implementation of the routine StatusCode DoHpr2(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xsbmv.h b/include/internal/routines/level2/xsbmv.h index bb24d8f4..4328e377 100644 --- a/include/internal/routines/level2/xsbmv.h +++ b/include/internal/routines/level2/xsbmv.h @@ -30,7 +30,7 @@ class Xsbmv: public Xgemv<T> { using Xgemv<T>::MatVec; // Constructor - Xsbmv(Queue &queue, Event &event, const std::string &name = "SBMV"); + Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV"); // Templated-precision implementation of the routine StatusCode DoSbmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xspmv.h b/include/internal/routines/level2/xspmv.h index 88f02a2f..ca3e28b6 100644 --- a/include/internal/routines/level2/xspmv.h +++ b/include/internal/routines/level2/xspmv.h @@ -30,7 +30,7 @@ class Xspmv: public Xgemv<T> { using Xgemv<T>::MatVec; // Constructor - Xspmv(Queue &queue, Event &event, const std::string &name = "SPMV"); + Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV"); // Templated-precision implementation of the routine StatusCode DoSpmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xspr.h b/include/internal/routines/level2/xspr.h index 5b01d2cb..7e91abc5 100644 --- a/include/internal/routines/level2/xspr.h +++ b/include/internal/routines/level2/xspr.h @@ -28,7 +28,7 @@ class Xspr: public Xher<T,T> { using Xher<T,T>::DoHer; // Constructor - Xspr(Queue &queue, Event &event, const std::string &name = "SPR"); + Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR"); // Templated-precision implementation of the routine StatusCode DoSpr(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xspr2.h b/include/internal/routines/level2/xspr2.h index 3d5f4992..a34be8e8 100644 --- a/include/internal/routines/level2/xspr2.h +++ b/include/internal/routines/level2/xspr2.h @@ -28,7 +28,7 @@ class Xspr2: public Xher2<T> { using Xher2<T>::DoHer2; // Constructor - Xspr2(Queue &queue, Event &event, const std::string &name = "SPR2"); + Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2"); // Templated-precision implementation of the routine StatusCode DoSpr2(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xsymv.h b/include/internal/routines/level2/xsymv.h index c7b92702..98a0ce88 100644 --- a/include/internal/routines/level2/xsymv.h +++ b/include/internal/routines/level2/xsymv.h @@ -30,7 +30,7 @@ class Xsymv: public Xgemv<T> { using Xgemv<T>::MatVec; // Constructor - Xsymv(Queue &queue, Event &event, const std::string &name = "SYMV"); + Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV"); // Templated-precision implementation of the routine StatusCode DoSymv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xsyr.h b/include/internal/routines/level2/xsyr.h index 9704a881..f88498ae 100644 --- a/include/internal/routines/level2/xsyr.h +++ b/include/internal/routines/level2/xsyr.h @@ -28,7 +28,7 @@ class Xsyr: public Xher<T,T> { using Xher<T,T>::DoHer; // Constructor - Xsyr(Queue &queue, Event &event, const std::string &name = "SYR"); + Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR"); // Templated-precision implementation of the routine StatusCode DoSyr(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xsyr2.h b/include/internal/routines/level2/xsyr2.h index f4dc9375..d2d3143a 100644 --- a/include/internal/routines/level2/xsyr2.h +++ b/include/internal/routines/level2/xsyr2.h @@ -28,7 +28,7 @@ class Xsyr2: public Xher2<T> { using Xher2<T>::DoHer2; // Constructor - Xsyr2(Queue &queue, Event &event, const std::string &name = "SYR2"); + Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2"); // Templated-precision implementation of the routine StatusCode DoSyr2(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h index 89c90193..3b358080 100644 --- a/include/internal/routines/level2/xtbmv.h +++ b/include/internal/routines/level2/xtbmv.h @@ -34,7 +34,7 @@ class Xtbmv: public Xgemv<T> { using Xgemv<T>::MatVec; // Constructor - Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV"); + Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV"); // Templated-precision implementation of the routine StatusCode DoTbmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h index 183d3505..f306cf4a 100644 --- a/include/internal/routines/level2/xtpmv.h +++ b/include/internal/routines/level2/xtpmv.h @@ -34,7 +34,7 @@ class Xtpmv: public Xgemv<T> { using Xgemv<T>::MatVec; // Constructor - Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV"); + Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV"); // Templated-precision implementation of the routine StatusCode DoTpmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h index dadfbc98..cf0824a4 100644 --- a/include/internal/routines/level2/xtrmv.h +++ b/include/internal/routines/level2/xtrmv.h @@ -34,7 +34,7 @@ class Xtrmv: public Xgemv<T> { using Xgemv<T>::MatVec; // Constructor - Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV"); + Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV"); // Templated-precision implementation of the routine StatusCode DoTrmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h index 9b40a7fc..85fb0616 100644 --- a/include/internal/routines/level3/xgemm.h +++ b/include/internal/routines/level3/xgemm.h @@ -28,6 +28,7 @@ class Xgemm: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; using Routine<T>::PadCopyTransposeMatrix; @@ -38,7 +39,7 @@ class Xgemm: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xgemm(Queue &queue, Event &event, const std::string &name = "GEMM"); + Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM"); // Templated-precision implementation of the routine StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h index ca38ca08..ec42b569 100644 --- a/include/internal/routines/level3/xhemm.h +++ b/include/internal/routines/level3/xhemm.h @@ -37,7 +37,7 @@ class Xhemm: public Xgemm<T> { using Xgemm<T>::DoGemm; // Constructor - Xhemm(Queue &queue, Event &event, const std::string &name = "HEMM"); + Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM"); // Templated-precision implementation of the routine StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h index 7113a172..623afd49 100644 --- a/include/internal/routines/level3/xher2k.h +++ b/include/internal/routines/level3/xher2k.h @@ -30,6 +30,7 @@ class Xher2k: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; using Routine<T>::PadCopyTransposeMatrix; @@ -40,7 +41,7 @@ class Xher2k: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xher2k(Queue &queue, Event &event, const std::string &name = "HER2K"); + Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K"); // Templated-precision implementation of the routine StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h index 47112c2c..629695ff 100644 --- a/include/internal/routines/level3/xherk.h +++ b/include/internal/routines/level3/xherk.h @@ -30,6 +30,7 @@ class Xherk: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; using Routine<T>::PadCopyTransposeMatrix; @@ -39,7 +40,7 @@ class Xherk: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xherk(Queue &queue, Event &event, const std::string &name = "HERK"); + Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK"); // Templated-precision implementation of the routine StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h index 9fc80eb4..16ad6f53 100644 --- a/include/internal/routines/level3/xsymm.h +++ b/include/internal/routines/level3/xsymm.h @@ -39,7 +39,7 @@ class Xsymm: public Xgemm<T> { using Xgemm<T>::DoGemm; // Constructor - Xsymm(Queue &queue, Event &event, const std::string &name = "SYMM"); + Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM"); // Templated-precision implementation of the routine StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h index c4679028..88669626 100644 --- a/include/internal/routines/level3/xsyr2k.h +++ b/include/internal/routines/level3/xsyr2k.h @@ -30,6 +30,7 @@ class Xsyr2k: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; using Routine<T>::PadCopyTransposeMatrix; @@ -40,7 +41,7 @@ class Xsyr2k: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xsyr2k(Queue &queue, Event &event, const std::string &name = "SYR2K"); + Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K"); // Templated-precision implementation of the routine StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h index abf6b681..e95c7c1c 100644 --- a/include/internal/routines/level3/xsyrk.h +++ b/include/internal/routines/level3/xsyrk.h @@ -32,6 +32,7 @@ class Xsyrk: public Routine<T> { using Routine<T>::db_; using Routine<T>::source_string_; using Routine<T>::queue_; + using Routine<T>::event_; using Routine<T>::context_; using Routine<T>::GetProgramFromCache; using Routine<T>::PadCopyTransposeMatrix; @@ -41,7 +42,7 @@ class Xsyrk: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xsyrk(Queue &queue, Event &event, const std::string &name = "SYRK"); + Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK"); // Templated-precision implementation of the routine StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h index a1f4d15c..01f6594d 100644 --- a/include/internal/routines/level3/xtrmm.h +++ b/include/internal/routines/level3/xtrmm.h @@ -38,7 +38,7 @@ class Xtrmm: public Xgemm<T> { using Xgemm<T>::DoGemm; // Constructor - Xtrmm(Queue &queue, Event &event, const std::string &name = "TRMM"); + Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM"); // Templated-precision implementation of the routine StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, diff --git a/samples/sgemm.cc b/samples/sgemm.cc index 785b051c..78f2dee8 100644 --- a/samples/sgemm.cc +++ b/samples/sgemm.cc @@ -61,7 +61,7 @@ int main() { // Creates the OpenCL context, queue, and an event auto context = cl::Context({device}); auto queue = cl::CommandQueue(context, device); - auto event = cl::Event(); + auto event = cl_event{nullptr}; // Populate host matrices with some example data auto host_a = std::vector<float>(m*k); @@ -84,7 +84,6 @@ int main() { // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision. auto queue_plain = queue(); - auto event_plain = event(); auto status = Gemm(clblast::Layout::kRowMajor, clblast::Transpose::kNo, clblast::Transpose::kNo, m, n, k, @@ -93,10 +92,10 @@ int main() { device_b(), 0, b_ld, beta, device_c(), 0, c_ld, - &queue_plain, &event_plain); + &queue_plain, &event); // Record the execution time - event.wait(); + clWaitForEvents(1, &event); auto elapsed_time = std::chrono::steady_clock::now() - start_time; auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count(); diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index bdf6b9d7..8cd35f95 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -169,8 +169,7 @@ def clblast_cc(routines): if routine.implemented: result += routine.RoutineHeaderCPP(12, "")+" {\n" result += " auto queue_cpp = Queue(*queue);\n" - result += " auto event_cpp = Event(event);\n" - result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event_cpp);\n" + result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event);\n" result += " auto status = routine.SetUp();\n" result += " if (status != StatusCode::kSuccess) { return status; }\n" result += " return routine.Do"+routine.name.capitalize()+"(" diff --git a/src/clblast.cc b/src/clblast.cc index 75893ee9..4f4b6078 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -161,8 +161,7 @@ StatusCode Swap(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xswap<T>(queue_cpp, event_cpp); + auto routine = Xswap<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSwap(n, @@ -193,8 +192,7 @@ StatusCode Scal(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xscal<T>(queue_cpp, event_cpp); + auto routine = Xscal<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoScal(n, @@ -225,8 +223,7 @@ StatusCode Copy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xcopy<T>(queue_cpp, event_cpp); + auto routine = Xcopy<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoCopy(n, @@ -258,8 +255,7 @@ StatusCode Axpy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xaxpy<T>(queue_cpp, event_cpp); + auto routine = Xaxpy<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoAxpy(n, @@ -296,8 +292,7 @@ StatusCode Dot(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xdot<T>(queue_cpp, event_cpp); + auto routine = Xdot<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoDot(n, @@ -324,8 +319,7 @@ StatusCode Dotu(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xdotu<T>(queue_cpp, event_cpp); + auto routine = Xdotu<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoDotu(n, @@ -352,8 +346,7 @@ StatusCode Dotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xdotc<T>(queue_cpp, event_cpp); + auto routine = Xdotc<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoDotc(n, @@ -379,8 +372,7 @@ StatusCode Nrm2(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xnrm2<T>(queue_cpp, event_cpp); + auto routine = Xnrm2<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoNrm2(n, @@ -419,8 +411,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xgemv<T>(queue_cpp, event_cpp); + auto routine = Xgemv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGemv(layout, a_transpose, @@ -475,8 +466,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xgbmv<T>(queue_cpp, event_cpp); + auto routine = Xgbmv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGbmv(layout, a_transpose, @@ -531,8 +521,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhemv<T>(queue_cpp, event_cpp); + auto routine = Xhemv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHemv(layout, triangle, @@ -571,8 +560,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhbmv<T>(queue_cpp, event_cpp); + auto routine = Xhbmv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHbmv(layout, triangle, @@ -611,8 +599,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhpmv<T>(queue_cpp, event_cpp); + auto routine = Xhpmv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHpmv(layout, triangle, @@ -651,8 +638,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsymv<T>(queue_cpp, event_cpp); + auto routine = Xsymv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSymv(layout, triangle, @@ -691,8 +677,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsbmv<T>(queue_cpp, event_cpp); + auto routine = Xsbmv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSbmv(layout, triangle, @@ -731,8 +716,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xspmv<T>(queue_cpp, event_cpp); + auto routine = Xspmv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSpmv(layout, triangle, @@ -768,8 +752,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xtrmv<T>(queue_cpp, event_cpp); + auto routine = Xtrmv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoTrmv(layout, triangle, a_transpose, diagonal, @@ -806,8 +789,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xtbmv<T>(queue_cpp, event_cpp); + auto routine = Xtbmv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoTbmv(layout, triangle, a_transpose, diagonal, @@ -844,8 +826,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xtpmv<T>(queue_cpp, event_cpp); + auto routine = Xtpmv<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoTpmv(layout, triangle, a_transpose, diagonal, @@ -974,8 +955,7 @@ StatusCode Ger(const Layout layout, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xger<T>(queue_cpp, event_cpp); + auto routine = Xger<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGer(layout, @@ -1010,8 +990,7 @@ StatusCode Geru(const Layout layout, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xgeru<T>(queue_cpp, event_cpp); + auto routine = Xgeru<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGeru(layout, @@ -1046,8 +1025,7 @@ StatusCode Gerc(const Layout layout, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xgerc<T>(queue_cpp, event_cpp); + auto routine = Xgerc<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGerc(layout, @@ -1081,8 +1059,7 @@ StatusCode Her(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xher<std::complex<T>,T>(queue_cpp, event_cpp); + auto routine = Xher<std::complex<T>,T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHer(layout, triangle, @@ -1113,8 +1090,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event_cpp); + auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHpr(layout, triangle, @@ -1146,8 +1122,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xher2<T>(queue_cpp, event_cpp); + auto routine = Xher2<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHer2(layout, triangle, @@ -1182,8 +1157,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhpr2<T>(queue_cpp, event_cpp); + auto routine = Xhpr2<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHpr2(layout, triangle, @@ -1217,8 +1191,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsyr<T>(queue_cpp, event_cpp); + auto routine = Xsyr<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSyr(layout, triangle, @@ -1249,8 +1222,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xspr<T>(queue_cpp, event_cpp); + auto routine = Xspr<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSpr(layout, triangle, @@ -1282,8 +1254,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsyr2<T>(queue_cpp, event_cpp); + auto routine = Xsyr2<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSyr2(layout, triangle, @@ -1318,8 +1289,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xspr2<T>(queue_cpp, event_cpp); + auto routine = Xspr2<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSpr2(layout, triangle, @@ -1359,8 +1329,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xgemm<T>(queue_cpp, event_cpp); + auto routine = Xgemm<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGemm(layout, a_transpose, b_transpose, @@ -1415,8 +1384,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsymm<T>(queue_cpp, event_cpp); + auto routine = Xsymm<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSymm(layout, side, triangle, @@ -1471,8 +1439,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhemm<T>(queue_cpp, event_cpp); + auto routine = Xhemm<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHemm(layout, side, triangle, @@ -1510,8 +1477,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsyrk<T>(queue_cpp, event_cpp); + auto routine = Xsyrk<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSyrk(layout, triangle, a_transpose, @@ -1560,8 +1526,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xherk<std::complex<T>,T>(queue_cpp, event_cpp); + auto routine = Xherk<std::complex<T>,T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHerk(layout, triangle, a_transpose, @@ -1597,8 +1562,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsyr2k<T>(queue_cpp, event_cpp); + auto routine = Xsyr2k<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSyr2k(layout, triangle, ab_transpose, @@ -1653,8 +1617,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xher2k<T,U>(queue_cpp, event_cpp); + auto routine = Xher2k<T,U>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHer2k(layout, triangle, ab_transpose, @@ -1691,8 +1654,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xtrmm<T>(queue_cpp, event_cpp); + auto routine = Xtrmm<T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, diff --git a/src/routine.cc b/src/routine.cc index ff7b3e1a..b5ba63eb 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -26,7 +26,7 @@ template <typename T> std::mutex Routine<T>::program_cache_mutex_; // Constructor: not much here, because no status codes can be returned template <typename T> -Routine<T>::Routine(Queue &queue, Event &event, const std::string &name, +Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name, const std::vector<std::string> &routines, const Precision precision): precision_(precision), routine_name_(name), @@ -117,7 +117,8 @@ StatusCode Routine<T>::SetUp() { // Enqueues a kernel, waits for completion, and checks for errors template <typename T> StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global, - const std::vector<size_t> &local) { + const std::vector<size_t> &local, EventPointer event, + std::vector<Event>& waitForEvents) { // Tests for validity of the local thread sizes if (local.size() > max_work_item_dimensions_) { @@ -141,18 +142,21 @@ StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global, // Launches the kernel (and checks for launch errors) try { - kernel.Launch(queue_, global, local, event_); + kernel.Launch(queue_, global, local, event, waitForEvents); } catch (...) { return StatusCode::kKernelLaunchError; } - // Waits for completion of the kernel - try { - queue_.Finish(event_); - } catch (...) { return StatusCode::kKernelRunError; } - // No errors, normal termination of this function return StatusCode::kSuccess; } +// As above, but without an event waiting list +template <typename T> +StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global, + const std::vector<size_t> &local, EventPointer event) { + auto emptyWaitingList = std::vector<Event>(); + return RunKernel(kernel, global, local, event, emptyWaitingList); +} + // ================================================================================================= // Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a @@ -258,7 +262,8 @@ StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, co // Copies or transposes a matrix and pads/unpads it with zeros template <typename T> -StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, +StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents, + const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer<T> &src, const size_t dest_one, const size_t dest_two, @@ -340,13 +345,13 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"], dest_two / db_["TRA_WPT"]}; auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event, waitForEvents); } else { auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]), Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])}; auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event, waitForEvents); } } else { @@ -354,13 +359,13 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t auto global = std::vector<size_t>{dest_one / db_["COPY_VW"], dest_two / db_["COPY_WPT"]}; auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event, waitForEvents); } else { auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event, waitForEvents); } } return status; diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index c5acaf49..37d23543 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -29,7 +29,7 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xaxpy<T>::Xaxpy(Queue &queue, Event &event, const std::string &name): +Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" @@ -89,13 +89,13 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, if (use_fast_kernel) { auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc index 8c7f8671..04508383 100644 --- a/src/routines/level1/xcopy.cc +++ b/src/routines/level1/xcopy.cc @@ -29,7 +29,7 @@ template <> const Precision Xcopy<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xcopy<T>::Xcopy(Queue &queue, Event &event, const std::string &name): +Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" @@ -87,13 +87,13 @@ StatusCode Xcopy<T>::DoCopy(const size_t n, if (use_fast_kernel) { auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc index e22b0f8b..4813a004 100644 --- a/src/routines/level1/xdot.cc +++ b/src/routines/level1/xdot.cc @@ -29,7 +29,7 @@ template <> const Precision Xdot<double2>::precision_ = Precision::kComplexDoubl // Constructor: forwards to base class constructor template <typename T> -Xdot<T>::Xdot(Queue &queue, Event &event, const std::string &name): +Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Xdot"}, precision_) { source_string_ = #include "../../kernels/level1/xdot.opencl" @@ -78,11 +78,16 @@ StatusCode Xdot<T>::DoDot(const size_t n, kernel1.SetArgument(7, temp_buffer()); kernel1.SetArgument(8, static_cast<int>(do_conjugate)); + // Event waiting list + auto eventWaitList = std::vector<Event>(); + // Launches the main kernel auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; auto local1 = std::vector<size_t>{db_["WGS1"]}; - status = RunKernel(kernel1, global1, local1); + auto kernelEvent = Event(); + status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); // Sets the arguments for the epilogue kernel kernel2.SetArgument(0, temp_buffer()); @@ -92,7 +97,7 @@ StatusCode Xdot<T>::DoDot(const size_t n, // Launches the epilogue kernel auto global2 = std::vector<size_t>{db_["WGS2"]}; auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2); + status = RunKernel(kernel2, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xdotc.cc b/src/routines/level1/xdotc.cc index f414f556..b3a01079 100644 --- a/src/routines/level1/xdotc.cc +++ b/src/routines/level1/xdotc.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xdotc<T>::Xdotc(Queue &queue, Event &event, const std::string &name): +Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name): Xdot<T>(queue, event, name) { } diff --git a/src/routines/level1/xdotu.cc b/src/routines/level1/xdotu.cc index 28d9b730..8dded6e0 100644 --- a/src/routines/level1/xdotu.cc +++ b/src/routines/level1/xdotu.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xdotu<T>::Xdotu(Queue &queue, Event &event, const std::string &name): +Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name): Xdot<T>(queue, event, name) { } diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index 685eb29f..04e4137c 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -29,7 +29,7 @@ template <> const Precision Xnrm2<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xnrm2<T>::Xnrm2(Queue &queue, Event &event, const std::string &name): +Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Xdot"}, precision_) { source_string_ = #include "../../kernels/level1/xnrm2.opencl" @@ -69,12 +69,16 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n, kernel1.SetArgument(2, static_cast<int>(x_offset)); kernel1.SetArgument(3, static_cast<int>(x_inc)); kernel1.SetArgument(4, temp_buffer()); + // Event waiting list + auto eventWaitList = std::vector<Event>(); // Launches the main kernel auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size}; auto local1 = std::vector<size_t>{db_["WGS1"]}; - status = RunKernel(kernel1, global1, local1); + auto kernelEvent = Event(); + status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); // Sets the arguments for the epilogue kernel kernel2.SetArgument(0, temp_buffer()); @@ -84,7 +88,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n, // Launches the epilogue kernel auto global2 = std::vector<size_t>{db_["WGS2"]}; auto local2 = std::vector<size_t>{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2); + status = RunKernel(kernel2, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc index 57bbe9e8..e83e73fd 100644 --- a/src/routines/level1/xscal.cc +++ b/src/routines/level1/xscal.cc @@ -29,7 +29,7 @@ template <> const Precision Xscal<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xscal<T>::Xscal(Queue &queue, Event &event, const std::string &name): +Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" @@ -81,13 +81,13 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha, if (use_fast_kernel) { auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc index c986b3fb..bc425f40 100644 --- a/src/routines/level1/xswap.cc +++ b/src/routines/level1/xswap.cc @@ -29,7 +29,7 @@ template <> const Precision Xswap<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xswap<T>::Xswap(Queue &queue, Event &event, const std::string &name): +Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" @@ -87,13 +87,13 @@ StatusCode Xswap<T>::DoSwap(const size_t n, if (use_fast_kernel) { auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector<size_t>{n_ceiled/db_["WPT"]}; auto local = std::vector<size_t>{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc index 14d391ca..f90e26b2 100644 --- a/src/routines/level2/xgbmv.cc +++ b/src/routines/level2/xgbmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name): +Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index bf7ae6fa..24e87db0 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -29,7 +29,7 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name): +Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) { source_string_ = #include "../../kernels/level2/xgemv.opencl" @@ -162,7 +162,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, // Launches the kernel auto global = std::vector<size_t>{global_size}; auto local = std::vector<size_t>{local_size}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index 9ab21bfb..dda78232 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -29,7 +29,7 @@ template <> const Precision Xger<double2>::precision_ = Precision::kComplexDoubl // Constructor: forwards to base class constructor template <typename T> -Xger<T>::Xger(Queue &queue, Event &event, const std::string &name): +Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Xger"}, precision_) { source_string_ = #include "../../kernels/level2/level2.opencl" @@ -89,7 +89,7 @@ StatusCode Xger<T>::DoGer(const Layout layout, auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]); auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled}; auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xgerc.cc b/src/routines/level2/xgerc.cc index 09408898..73284b52 100644 --- a/src/routines/level2/xgerc.cc +++ b/src/routines/level2/xgerc.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xgerc<T>::Xgerc(Queue &queue, Event &event, const std::string &name): +Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name): Xger<T>(queue, event, name) { } diff --git a/src/routines/level2/xgeru.cc b/src/routines/level2/xgeru.cc index 36fd9d0a..7730d6a5 100644 --- a/src/routines/level2/xgeru.cc +++ b/src/routines/level2/xgeru.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xgeru<T>::Xgeru(Queue &queue, Event &event, const std::string &name): +Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name): Xger<T>(queue, event, name) { } diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc index f59a7cb3..58591b50 100644 --- a/src/routines/level2/xhbmv.cc +++ b/src/routines/level2/xhbmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name): +Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc index 5a58b28b..b4ef0fa4 100644 --- a/src/routines/level2/xhemv.cc +++ b/src/routines/level2/xhemv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xhemv<T>::Xhemv(Queue &queue, Event &event, const std::string &name): +Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index 1aefa240..aba665b0 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -28,7 +28,7 @@ template <> const Precision Xher<double2, double>::precision_ = Precision::kComp // Constructor: forwards to base class constructor template <typename T, typename U> -Xher<T,U>::Xher(Queue &queue, Event &event, const std::string &name): +Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Xger"}, precision_) { source_string_ = #include "../../kernels/level2/level2.opencl" @@ -99,7 +99,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); auto global = std::vector<size_t>{global_one, global_two}; auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index 364add12..bcd6488f 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -28,7 +28,7 @@ template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xher2<T>::Xher2(Queue &queue, Event &event, const std::string &name): +Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Xger"}, precision_) { source_string_ = #include "../../kernels/level2/level2.opencl" @@ -91,7 +91,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); auto global = std::vector<size_t>{global_one, global_two}; auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc index 2269255d..92686dbe 100644 --- a/src/routines/level2/xhpmv.cc +++ b/src/routines/level2/xhpmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xhpmv<T>::Xhpmv(Queue &queue, Event &event, const std::string &name): +Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } diff --git a/src/routines/level2/xhpr.cc b/src/routines/level2/xhpr.cc index b0cea72f..4b31ad09 100644 --- a/src/routines/level2/xhpr.cc +++ b/src/routines/level2/xhpr.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T, typename U> -Xhpr<T,U>::Xhpr(Queue &queue, Event &event, const std::string &name): +Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name): Xher<T,U>(queue, event, name) { } diff --git a/src/routines/level2/xhpr2.cc b/src/routines/level2/xhpr2.cc index ded35e53..9be24f43 100644 --- a/src/routines/level2/xhpr2.cc +++ b/src/routines/level2/xhpr2.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xhpr2<T>::Xhpr2(Queue &queue, Event &event, const std::string &name): +Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const std::string &name): Xher2<T>(queue, event, name) { } diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc index 457bd762..bc82c88d 100644 --- a/src/routines/level2/xsbmv.cc +++ b/src/routines/level2/xsbmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xsbmv<T>::Xsbmv(Queue &queue, Event &event, const std::string &name): +Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc index 4f1a9c61..6e00dcfa 100644 --- a/src/routines/level2/xspmv.cc +++ b/src/routines/level2/xspmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xspmv<T>::Xspmv(Queue &queue, Event &event, const std::string &name): +Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cc index 2d998e0b..55af2f29 100644 --- a/src/routines/level2/xspr.cc +++ b/src/routines/level2/xspr.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xspr<T>::Xspr(Queue &queue, Event &event, const std::string &name): +Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name): Xher<T,T>(queue, event, name) { } diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cc index fd5232da..9a3f97ce 100644 --- a/src/routines/level2/xspr2.cc +++ b/src/routines/level2/xspr2.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xspr2<T>::Xspr2(Queue &queue, Event &event, const std::string &name): +Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name): Xher2<T>(queue, event, name) { } diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc index ec12324b..a9eb284f 100644 --- a/src/routines/level2/xsymv.cc +++ b/src/routines/level2/xsymv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xsymv<T>::Xsymv(Queue &queue, Event &event, const std::string &name): +Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cc index c01fa2d3..4b3928e5 100644 --- a/src/routines/level2/xsyr.cc +++ b/src/routines/level2/xsyr.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xsyr<T>::Xsyr(Queue &queue, Event &event, const std::string &name): +Xsyr<T>::Xsyr(Queue &queue, EventPointer event, const std::string &name): Xher<T,T>(queue, event, name) { } diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cc index 6db55085..3ae389e0 100644 --- a/src/routines/level2/xsyr2.cc +++ b/src/routines/level2/xsyr2.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xsyr2<T>::Xsyr2(Queue &queue, Event &event, const std::string &name): +Xsyr2<T>::Xsyr2(Queue &queue, EventPointer event, const std::string &name): Xher2<T>(queue, event, name) { } diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc index 2e1aebff..47371c87 100644 --- a/src/routines/level2/xtbmv.cc +++ b/src/routines/level2/xtbmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xtbmv<T>::Xtbmv(Queue &queue, Event &event, const std::string &name): +Xtbmv<T>::Xtbmv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc index aa0e099b..c63cb9b2 100644 --- a/src/routines/level2/xtpmv.cc +++ b/src/routines/level2/xtpmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xtpmv<T>::Xtpmv(Queue &queue, Event &event, const std::string &name): +Xtpmv<T>::Xtpmv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc index 94424743..9111d41d 100644 --- a/src/routines/level2/xtrmv.cc +++ b/src/routines/level2/xtrmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xtrmv<T>::Xtrmv(Queue &queue, Event &event, const std::string &name): +Xtrmv<T>::Xtrmv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 5dc2ad7f..7557dcc3 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -29,7 +29,7 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xgemm<T>::Xgemm(Queue &queue, Event &event, const std::string &name): +Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -122,30 +122,43 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, m_ceiled, k_ceiled, m_ceiled, 0, a_temp, program, true, a_do_transpose, a_conjugate); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessA); } // As above, but now for matrix B if (!b_no_temp) { - status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer, + auto eventProcessB = Event(); + status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + b_one, b_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, program, true, b_do_transpose, b_conjugate); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessB); } // As above, but now for matrix C. This is only necessary if C is used both as input and output. if (!c_no_temp && beta != static_cast<T>(0)) { - status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + c_one, c_two, c_ld, c_offset, c_buffer, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, program, true, c_do_transpose, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessC); } // Retrieves the Xgemm kernel from the compiled binary @@ -170,12 +183,15 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel = Event(); + status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel); // Runs the post-processing kernel if needed if (!c_no_temp) { - status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, c_temp, + status = PadCopyTransposeMatrix(event_, eventWaitList, + m_ceiled, n_ceiled, m_ceiled, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, program, false, c_do_transpose, false); if (ErrorIn(status)) { return status; } diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index bcc60dee..c0a4306a 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xhemm<T>::Xhemm(Queue &queue, Event &event, const std::string &name): +Xhemm<T>::Xhemm(Queue &queue, EventPointer event, const std::string &name): Xgemm<T>(queue, event, name) { } @@ -79,9 +79,13 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); + auto kernelEvent = Event(); + status = RunKernel(kernel, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + // Runs the regular Xgemm code with either "C := AB+C" or ... if (side == Side::kLeft) { status = DoGemm(layout, Transpose::kNo, Transpose::kNo, diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 1711905d..4d5a4d35 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -27,7 +27,7 @@ template <> const Precision Xher2k<double2,double>::precision_ = Precision::kCom // Constructor: forwards to base class constructor template <typename T, typename U> -Xher2k<T,U>::Xher2k(Queue &queue, Event &event, const std::string &name): +Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -112,39 +112,58 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. if (!a1_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + auto eventProcessA1 = Event(); + status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, program, true, ab_rotated, ab_conjugate); + eventWaitList.push_back(eventProcessA1); if (ErrorIn(status)) { return status; } } if (!a2_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + auto eventProcessA2 = Event(); + status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, program, true, ab_rotated, !ab_conjugate); + eventWaitList.push_back(eventProcessA2); if (ErrorIn(status)) { return status; } } if (!b1_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + auto eventProcessB1 = Event(); + status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, program, true, ab_rotated, ab_conjugate); + eventWaitList.push_back(eventProcessB1); if (ErrorIn(status)) { return status; } } if (!b2_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + auto eventProcessB2 = Event(); + status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, program, true, ab_rotated, !ab_conjugate); + eventWaitList.push_back(eventProcessB2); if (ErrorIn(status)) { return status; } } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, program, true, c_rotated, false); + eventWaitList.push_back(eventProcessC); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -169,8 +188,10 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel1 = Event(); + status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel1); // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; @@ -181,13 +202,16 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co kernel.SetArgument(5, a2_temp()); // Runs the kernel again - status = RunKernel(kernel, global, local); + auto eventKernel2 = Event(); + status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel2); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + status = PadCopyTransposeMatrix(event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, program, false, c_rotated, false, upper, lower, true); if (ErrorIn(status)) { return status; } diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index cbd0a188..574debe4 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -27,7 +27,7 @@ template <> const Precision Xherk<double2,double>::precision_ = Precision::kComp // Constructor: forwards to base class constructor template <typename T, typename U> -Xherk<T,U>::Xherk(Queue &queue, Event &event, const std::string &name): +Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -103,27 +103,40 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. Two copies are created. if (!a_no_temp) { - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, program, true, a_rotated, a_conjugate); + eventWaitList.push_back(eventProcessA); if (ErrorIn(status)) { return status; } } if (!b_no_temp) { - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + auto eventProcessB = Event(); + status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, program, true, a_rotated, b_conjugate); + eventWaitList.push_back(eventProcessB); if (ErrorIn(status)) { return status; } } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, program, true, c_rotated, false); + eventWaitList.push_back(eventProcessC); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -149,13 +162,16 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel = Event(); + status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + status = PadCopyTransposeMatrix(event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, program, false, c_rotated, false, upper, lower, true); if (ErrorIn(status)) { return status; } diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index 583d5c7d..914a326a 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xsymm<T>::Xsymm(Queue &queue, Event &event, const std::string &name): +Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name): Xgemm<T>(queue, event, name) { } @@ -79,9 +79,13 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); + auto kernelEvent = Event(); + status = RunKernel(kernel, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + // Runs the regular Xgemm code with either "C := AB+C" or ... if (side == Side::kLeft) { status = DoGemm(layout, Transpose::kNo, Transpose::kNo, diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 79090871..44d0024e 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -29,7 +29,7 @@ template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDou // Constructor: forwards to base class constructor template <typename T> -Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event, const std::string &name): +Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -104,28 +104,41 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, program, true, ab_rotated, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessA); } if (!b_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + auto eventProcessB = Event(); + status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, program, true, ab_rotated, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessB); } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, program, true, c_rotated, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary try { @@ -148,8 +161,10 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel1 = Event(); + status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel1); // Swaps the arguments for matrices A and B, and sets 'beta' to 1 auto one = static_cast<T>(1); @@ -158,13 +173,16 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons kernel.SetArgument(5, a_temp()); // Runs the kernel again - status = RunKernel(kernel, global, local); + auto eventKernel2 = Event(); + status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel2); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + status = PadCopyTransposeMatrix(event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, program, false, c_rotated, false, upper, lower, false); if (ErrorIn(status)) { return status; } diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index ca429bd7..44ed8d35 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -29,7 +29,7 @@ template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xsyrk<T>::Xsyrk(Queue &queue, Event &event, const std::string &name): +Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -97,22 +97,32 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector<Event>(); + auto emptyEventList = std::vector<Event>(); + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, program, true, a_rotated, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessA); } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, program, true, c_rotated, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary try { @@ -135,17 +145,21 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel = Event(); + status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + status = PadCopyTransposeMatrix(event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, program, false, c_rotated, false, upper, lower, false); if (ErrorIn(status)) { return status; } + // Successfully finished the computation return StatusCode::kSuccess; } catch (...) { return StatusCode::kInvalidKernel; } diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 1180c026..484cf040 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xtrmm<T>::Xtrmm(Queue &queue, Event &event, const std::string &name): +Xtrmm<T>::Xtrmm(Queue &queue, EventPointer event, const std::string &name): Xgemm<T>(queue, event, name) { } @@ -82,9 +82,13 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); + auto kernelEvent = Event(); + status = RunKernel(kernel, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + // Runs the regular Xgemm code with either "B := alpha*A*B" or ... if (side == Side::kLeft) { status = DoGemm(layout, a_transpose, Transpose::kNo, |