diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-14 12:23:35 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-14 12:23:35 +0200 |
commit | 74d6e0048cfcdfd65ab29db47f5b4ffafba0bd51 (patch) | |
tree | 51007b237755b312bd17c6c34428bba8db613034 /samples | |
parent | 54d0c440ce84d61db1b462033052dd0f532a40d8 (diff) |
Added DAXPY example for the CUDA API
Diffstat (limited to 'samples')
-rw-r--r-- | samples/daxpy_cuda.cpp | 88 | ||||
-rw-r--r-- | samples/sgemm_cuda.cpp | 4 |
2 files changed, 90 insertions, 2 deletions
```diff
diff --git a/samples/daxpy_cuda.cpp b/samples/daxpy_cuda.cpp
new file mode 100644
index 00000000..cead3f6d
--- /dev/null
+++ b/samples/daxpy_cuda.cpp
@@ -0,0 +1,88 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the DAXPY routine with the C++ CUDA API of CLBlast.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <chrono>
+#include <vector>
+
+// Includes the CUDA driver API
+#include <cuda.h>
+
+// Includes the CLBlast library
+#include <clblast_cuda.h>
+
+// =================================================================================================
+
+// Example use of the double-precision Xaxpy routine DAXPY
+int main() {
+
+  // CUDA device selection
+  const auto device_id = 0;
+
+  // Example DAXPY arguments
+  const size_t n = 8192;
+  const double alpha = 0.7;
+
+  // Initializes the CUDA device
+  cuInit(0);
+  CUdevice device;
+  cuDeviceGet(&device, device_id);
+
+  // Creates the CUDA context and stream
+  CUcontext context;
+  cuCtxCreate(&context, 0, device);
+  CUstream stream;
+  cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
+
+  // Populate host vectors with some example data
+  auto host_a = std::vector<double>(n);
+  auto host_b = std::vector<double>(n);
+  for (auto &item: host_a) { item = 12.193; }
+  for (auto &item: host_b) { item = -8.199; }
+
+  // Copy the vectors to the device
+  CUdeviceptr device_a;
+  CUdeviceptr device_b;
+  cuMemAlloc(&device_a, host_a.size()*sizeof(double));
+  cuMemAlloc(&device_b, host_b.size()*sizeof(double));
+  cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(double), stream);
+  cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(double), stream);
+
+  // Start the timer
+  auto start_time = std::chrono::steady_clock::now();
+
+  // Call the DAXPY routine. Note that the type of alpha (double) determines the precision.
+  const auto status = clblast::Axpy(n, alpha,
+                                    device_a, 0, 1,
+                                    device_b, 0, 1,
+                                    context, device);
+  cuStreamSynchronize(stream);
+
+  // Record the execution time
+  auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+  auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
+
+  // Example completed. See "clblast_cuda.h" for status codes (0 -> success).
+  printf("Completed DAXPY in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
+
+  // Clean-up
+  cuMemFree(device_a);
+  cuMemFree(device_b);
+  cuStreamDestroy(stream);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/sgemm_cuda.cpp b/samples/sgemm_cuda.cpp
index f1138316..8e4397df 100644
--- a/samples/sgemm_cuda.cpp
+++ b/samples/sgemm_cuda.cpp
@@ -69,8 +69,8 @@ int main() {
   cuMemAlloc(&device_b, host_b.size()*sizeof(float));
   cuMemAlloc(&device_c, host_c.size()*sizeof(float));
   cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(float), stream);
-  cuMemcpyHtoDAsync(device_b, host_c.data(), host_b.size()*sizeof(float), stream);
-  cuMemcpyHtoDAsync(device_c, host_b.data(), host_c.size()*sizeof(float), stream);
+  cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(float), stream);
+  cuMemcpyHtoDAsync(device_c, host_c.data(), host_c.size()*sizeof(float), stream);
 
   // Start the timer
   auto start_time = std::chrono::steady_clock::now();
```