Made all CUDA kernel launches synchronous; removed exception raising

author: Cedric Nugteren <web@cedricnugteren.nl> 2017-10-16 21:52:55 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-10-16 21:54:42 +0200
commit: 0719f1448655192d2ce6c17ee51c770ef16dd120 (patch)
tree: 062fd7deb36864e4ed34ba17ade03bb1f2697677 /src/cupp11.hpp
parent: d62823f0674df0593836b9b0eb9c0d6e939acf86 (diff)
1 files changed, 9 insertions, 10 deletions
diff --git a/src/cupp11.hpp b/src/cupp11.hpp
index 2a54ef95..1c7e6c9c 100644
--- a/src/cupp11.hpp
+++ b/src/cupp11.hpp
@@ -146,7 +146,7 @@ public:
   }
 
   // Waits for completion of this event (not implemented for CUDA)
-  void WaitForCompletion() const { }
+  void WaitForCompletion() const { }   // not needed due to cuStreamSynchronize call after each kernel launch
 
   // Retrieves the elapsed time of the last recorded event
   float GetElapsedTime() const {
@@ -716,6 +716,10 @@ public:
   // Launches a kernel onto the specified queue
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event) {
+    // TODO: Currently this CUDA launch is always synchronous due to a cuStreamSynchronize call
+    if (local.size() == 0) {
+      throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end");
+    }
 
     // Creates the grid (number of threadblocks) and sets the block sizes (threads per block)
     auto grid = std::vector<size_t>{1, 1, 1};
@@ -734,23 +738,18 @@ public:
     if (event) { CheckError(cuEventRecord(event->start(), queue())); }
     CheckError(cuLaunchKernel(kernel_, grid[0], grid[1], grid[2], block[0], block[1], block[2],
                               0, queue(), pointers.data(), nullptr));
+    cuStreamSynchronize(queue());
     if (event) { CheckError(cuEventRecord(event->end(), queue())); }
   }
 
   // As above, but with an event waiting list
-  // TODO: Implement this function
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event,
               const std::vector<Event>& waitForEvents) {
-    if (local.size() == 0) {
-      throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end");
-    }
-    else if (waitForEvents.size() != 0) {
-      throw LogicError("Kernel: launching with an event waiting list is not implemented for the CUDA back-end");
-    }
-    else {
-      return Launch(queue, global, local, event);
+    for (auto &waitEvent : waitForEvents) {
+      waitEvent.WaitForCompletion(); // note: doesn't do anything, every kernel call is synchronous
     }
+    return Launch(queue, global, local, event);
   }
 
   // Accessors to the private data-members
author	Cedric Nugteren <web@cedricnugteren.nl>	2017-10-16 21:52:55 +0200
committer	Cedric Nugteren <web@cedricnugteren.nl>	2017-10-16 21:54:42 +0200
commit	0719f1448655192d2ce6c17ee51c770ef16dd120 (patch)
tree	062fd7deb36864e4ed34ba17ade03bb1f2697677 /src/cupp11.hpp
parent	d62823f0674df0593836b9b0eb9c0d6e939acf86 (diff)