summary | refs | log | tree | commit | diff
path: root/samples
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2017-10-14 11:43:57 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2017-10-14 11:43:57 +0200
commit 54d0c440ce84d61db1b462033052dd0f532a40d8 (patch)
tree b117bb9044a2f6b56428af2fc8e73c27c106ba15 /samples
parent 16b9efd60528ea9230810e6cb6287fe780f02527 (diff)
Various fixes to make the host code and sample compile with the CUDA API
Diffstat (limited to 'samples')
-rw-r--r-- samples/sgemm_cuda.cpp 26
1 files changed, 14 insertions, 12 deletions
diff --git a/samples/sgemm_cuda.cpp b/samples/sgemm_cuda.cpp
index ed2ad588..f1138316 100644
--- a/samples/sgemm_cuda.cpp
+++ b/samples/sgemm_cuda.cpp
@@ -19,7 +19,7 @@
#include <vector>
// Includes the CUDA driver API
-#include <cuda>
+#include <cuda.h>
// Includes the CLBlast library
#include <clblast_cuda.h>
@@ -43,14 +43,15 @@ int main() {
const auto c_ld = n;
// Initializes the OpenCL device
+ cuInit(0);
CUdevice device;
cuDeviceGet(&device, device_id);
// Creates the OpenCL context and stream
CUcontext context;
- cuCtxCreate(context, 0, device);
+ cuCtxCreate(&context, 0, device);
CUstream stream;
- cuStreamCreate(queue, CU_STREAM_NON_BLOCKING);
+ cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
// Populate host matrices with some example data
auto host_a = std::vector<float>(m*k);
@@ -64,12 +65,12 @@ int main() {
CUdeviceptr device_a;
CUdeviceptr device_b;
CUdeviceptr device_c;
- cuMemAlloc(device_a, host_a.size()*sizeof(float));
- cuMemAlloc(device_b, host_b.size()*sizeof(float));
- cuMemAlloc(device_c, host_c.size()*sizeof(float));
- cuMemcpyHtoDAsync(device_a, host_a.data()), host_a.size()*sizeof(T), queue);
- cuMemcpyHtoDAsync(device_b, host_c.data()), host_b.size()*sizeof(T), queue);
- cuMemcpyHtoDAsync(device_c, host_b.data()), host_c.size()*sizeof(T), queue);
+ cuMemAlloc(&device_a, host_a.size()*sizeof(float));
+ cuMemAlloc(&device_b, host_b.size()*sizeof(float));
+ cuMemAlloc(&device_c, host_c.size()*sizeof(float));
+ cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(float), stream);
+ cuMemcpyHtoDAsync(device_b, host_c.data(), host_b.size()*sizeof(float), stream);
+ cuMemcpyHtoDAsync(device_c, host_b.data(), host_c.size()*sizeof(float), stream);
// Start the timer
auto start_time = std::chrono::steady_clock::now();
@@ -79,11 +80,12 @@ int main() {
clblast::Transpose::kNo, clblast::Transpose::kNo,
m, n, k,
alpha,
- device_a(), 0, a_ld,
- device_b(), 0, b_ld,
+ device_a, 0, a_ld,
+ device_b, 0, b_ld,
beta,
- device_c(), 0, c_ld,
+ device_c, 0, c_ld,
context, device);
+ cuStreamSynchronize(stream);
// Record the execution time
auto elapsed_time = std::chrono::steady_clock::now() - start_time;