From 74d6e0048cfcdfd65ab29db47f5b4ffafba0bd51 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 14 Oct 2017 12:23:35 +0200
Subject: Added DAXPY example for the CUDA API

---
 samples/daxpy_cuda.cpp | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++
 samples/sgemm_cuda.cpp |  4 +--
 2 files changed, 90 insertions(+), 2 deletions(-)
 create mode 100644 samples/daxpy_cuda.cpp

(limited to 'samples')
diff --git a/samples/daxpy_cuda.cpp b/samples/daxpy_cuda.cpp
new file mode 100644
index 00000000..cead3f6d
--- /dev/null
+++ b/samples/daxpy_cuda.cpp
@@ -0,0 +1,88 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the DAXPY routine with the C++ CUDA API of CLBlast.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <chrono>
+#include <vector>
+
+// Includes the CUDA driver API
+#include <cuda.h>
+
+// Includes the CLBlast library
+#include <clblast_cuda.h>
+
+// =================================================================================================
+
+// Example use of the double-precision Xaxpy routine DAXPY
+int main() {
+
+  // CUDA device selection
+  const auto device_id = 0;
+
+  // Example DAXPY arguments
+  const size_t n = 8192;
+  const double alpha = 0.7;
+
+  // Initializes the CUDA driver API and retrieves the handle of the selected device
+  cuInit(0);
+  CUdevice device;
+  cuDeviceGet(&device, device_id);
+
+  // Creates the CUDA context and stream
+  CUcontext context;
+  cuCtxCreate(&context, 0, device);
+  CUstream stream;
+  cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
+
+  // Populate host vectors with some example data
+  auto host_a = std::vector<double>(n);
+  auto host_b = std::vector<double>(n);
+  for (auto &item: host_a) { item = 12.193; }
+  for (auto &item: host_b) { item = -8.199; }
+
+  // Allocates the device buffers and enqueues the host-to-device copies on the stream
+  CUdeviceptr device_a;
+  CUdeviceptr device_b;
+  cuMemAlloc(&device_a, host_a.size()*sizeof(double));
+  cuMemAlloc(&device_b, host_b.size()*sizeof(double));
+  cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(double), stream);
+  cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(double), stream);
+
+  // Start the timer. NOTE(review): the async copies above have not been waited on yet.
+  auto start_time = std::chrono::steady_clock::now();
+
+  // Call the DAXPY routine. Note that the type of alpha (double) determines the precision.
+  const auto status = clblast::Axpy(n, alpha,
+                                    device_a, 0, 1,
+                                    device_b, 0, 1,
+                                    context, device);
+  cuStreamSynchronize(stream);
+
+  // Record the execution time (wall-clock time since the timer was started, in milliseconds)
+  auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+  auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
+
+  // Example completed. See "clblast_cuda.h" for status codes (0 -> success).
+  printf("Completed DAXPY in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
+
+  // Clean-up. NOTE(review): 'context' is never destroyed here - confirm this is intended.
+  cuMemFree(device_a);
+  cuMemFree(device_b);
+  cuStreamDestroy(stream);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/sgemm_cuda.cpp b/samples/sgemm_cuda.cpp
index f1138316..8e4397df 100644
--- a/samples/sgemm_cuda.cpp
+++ b/samples/sgemm_cuda.cpp
@@ -69,8 +69,8 @@ int main() {
   cuMemAlloc(&device_b, host_b.size()*sizeof(float));
   cuMemAlloc(&device_c, host_c.size()*sizeof(float));
   cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(float), stream);
-  cuMemcpyHtoDAsync(device_b, host_c.data(), host_b.size()*sizeof(float), stream);
-  cuMemcpyHtoDAsync(device_c, host_b.data(), host_c.size()*sizeof(float), stream);
+  cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(float), stream);
+  cuMemcpyHtoDAsync(device_c, host_c.data(), host_c.size()*sizeof(float), stream);
 
   // Start the timer
   auto start_time = std::chrono::steady_clock::now();
-- 
cgit v1.2.3