diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-14 11:43:57 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-14 11:43:57 +0200 |
commit | 54d0c440ce84d61db1b462033052dd0f532a40d8 (patch) | |
tree | b117bb9044a2f6b56428af2fc8e73c27c106ba15 /samples | |
parent | 16b9efd60528ea9230810e6cb6287fe780f02527 (diff) | |
Various fixes to make the host code and sample compile with the CUDA API
Diffstat (limited to 'samples')
-rw-r--r-- | samples/sgemm_cuda.cpp | 26 |
1 files changed, 14 insertions, 12 deletions
```diff
diff --git a/samples/sgemm_cuda.cpp b/samples/sgemm_cuda.cpp
index ed2ad588..f1138316 100644
--- a/samples/sgemm_cuda.cpp
+++ b/samples/sgemm_cuda.cpp
@@ -19,7 +19,7 @@
 #include <vector>
 
 // Includes the CUDA driver API
-#include <cuda>
+#include <cuda.h>
 
 // Includes the CLBlast library
 #include <clblast_cuda.h>
@@ -43,14 +43,15 @@ int main() {
   const auto c_ld = n;
 
   // Initializes the OpenCL device
+  cuInit(0);
   CUdevice device;
   cuDeviceGet(&device, device_id);
 
   // Creates the OpenCL context and stream
   CUcontext context;
-  cuCtxCreate(context, 0, device);
+  cuCtxCreate(&context, 0, device);
   CUstream stream;
-  cuStreamCreate(queue, CU_STREAM_NON_BLOCKING);
+  cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
 
   // Populate host matrices with some example data
   auto host_a = std::vector<float>(m*k);
@@ -64,12 +65,12 @@ int main() {
   CUdeviceptr device_a;
   CUdeviceptr device_b;
   CUdeviceptr device_c;
-  cuMemAlloc(device_a, host_a.size()*sizeof(float));
-  cuMemAlloc(device_b, host_b.size()*sizeof(float));
-  cuMemAlloc(device_c, host_c.size()*sizeof(float));
-  cuMemcpyHtoDAsync(device_a, host_a.data()), host_a.size()*sizeof(T), queue);
-  cuMemcpyHtoDAsync(device_b, host_c.data()), host_b.size()*sizeof(T), queue);
-  cuMemcpyHtoDAsync(device_c, host_b.data()), host_c.size()*sizeof(T), queue);
+  cuMemAlloc(&device_a, host_a.size()*sizeof(float));
+  cuMemAlloc(&device_b, host_b.size()*sizeof(float));
+  cuMemAlloc(&device_c, host_c.size()*sizeof(float));
+  cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(float), stream);
+  cuMemcpyHtoDAsync(device_b, host_c.data(), host_b.size()*sizeof(float), stream);
+  cuMemcpyHtoDAsync(device_c, host_b.data(), host_c.size()*sizeof(float), stream);
 
   // Start the timer
   auto start_time = std::chrono::steady_clock::now();
@@ -79,11 +80,12 @@ int main() {
                               clblast::Transpose::kNo, clblast::Transpose::kNo,
                               m, n, k,
                               alpha,
-                              device_a(), 0, a_ld,
-                              device_b(), 0, b_ld,
+                              device_a, 0, a_ld,
+                              device_b, 0, b_ld,
                               beta,
-                              device_c(), 0, c_ld,
+                              device_c, 0, c_ld,
                               context, device);
+  cuStreamSynchronize(stream);
 
   // Record the execution time
   auto elapsed_time = std::chrono::steady_clock::now() - start_time;
```