diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-20 12:07:30 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-20 12:07:30 +0200 |
commit | 42dcd8fd8a81c66783827dc4826117b3af610376 (patch) | |
tree | a321cdec1fbb96ec54257b76dccb91184f01b015 /test | |
parent | 48133a0cd1a7b61b87906ec1f4608e766e20a973 (diff) | |
parent | 363568787ebfcdc0c5e6af9c3c8e71c702e2f951 (diff) |
Merge pull request #204 from CNugteren/cuda_api
Cuda API to CLBlast
Diffstat (limited to 'test')
54 files changed, 891 insertions, 450 deletions
diff --git a/test/correctness/routines/level3/xgemm.cpp b/test/correctness/routines/level3/xgemm.cpp index bdf57b36..351e538b 100644 --- a/test/correctness/routines/level3/xgemm.cpp +++ b/test/correctness/routines/level3/xgemm.cpp @@ -15,21 +15,16 @@ // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - - // Tests GEMM based on the 'in-direct' kernel errors += clblast::RunTests<clblast::TestXgemm<1, float>, float, float>(argc, argv, false, "SGEMM"); errors += clblast::RunTests<clblast::TestXgemm<1, double>, double, double>(argc, argv, true, "DGEMM"); errors += clblast::RunTests<clblast::TestXgemm<1, clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM"); errors += clblast::RunTests<clblast::TestXgemm<1, clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM"); errors += clblast::RunTests<clblast::TestXgemm<1, clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMM"); - - // Tests GEMM based on the 'direct' kernel errors += clblast::RunTests<clblast::TestXgemm<2, float>, float, float>(argc, argv, true, "SGEMM"); errors += clblast::RunTests<clblast::TestXgemm<2, double>, double, double>(argc, argv, true, "DGEMM"); errors += clblast::RunTests<clblast::TestXgemm<2, clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM"); errors += clblast::RunTests<clblast::TestXgemm<2, clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM"); errors += clblast::RunTests<clblast::TestXgemm<2, clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMM"); - if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index 659131c5..aa4b4785 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -241,36 +241,22 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st std::cout << std::flush; } - // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly + // Creates the buffers. Note: we are not using the cxpp11.h C++ version since we explicitly // want to be able to create invalid buffers (no error checking here). - auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); - auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); - auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); - auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); - auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); - auto x_vec1 = Buffer<T>(x1); - auto y_vec1 = Buffer<T>(y1); - auto a_mat1 = Buffer<T>(a1); - auto b_mat1 = Buffer<T>(b1); - auto c_mat1 = Buffer<T>(c1); - auto ap_mat1 = Buffer<T>(ap1); - auto scalar1 = Buffer<T>(d1); - auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); - auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); - auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); - auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); - auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); - auto x_vec2 = Buffer<T>(x2); - auto y_vec2 = Buffer<T>(y2); - auto a_mat2 = Buffer<T>(a2); - auto b_mat2 = Buffer<T>(b2); - auto c_mat2 = Buffer<T>(c2); - auto ap_mat2 = Buffer<T>(ap2); - auto scalar2 = Buffer<T>(d2); + auto x_vec1 = CreateInvalidBuffer<T>(context_, args.x_size); + auto y_vec1 = CreateInvalidBuffer<T>(context_, args.y_size); + auto a_mat1 = CreateInvalidBuffer<T>(context_, args.a_size); + auto b_mat1 = CreateInvalidBuffer<T>(context_, args.b_size); + auto c_mat1 = CreateInvalidBuffer<T>(context_, args.c_size); + auto ap_mat1 = CreateInvalidBuffer<T>(context_, args.ap_size); + auto scalar1 = CreateInvalidBuffer<T>(context_, args.scalar_size); + auto x_vec2 = CreateInvalidBuffer<T>(context_, args.x_size); + auto y_vec2 = CreateInvalidBuffer<T>(context_, args.y_size); + auto a_mat2 = CreateInvalidBuffer<T>(context_, args.a_size); + auto b_mat2 = CreateInvalidBuffer<T>(context_, args.b_size); + auto c_mat2 = CreateInvalidBuffer<T>(context_, args.c_size); + auto ap_mat2 = CreateInvalidBuffer<T>(context_, args.ap_size); + auto scalar2 = CreateInvalidBuffer<T>(context_, args.scalar_size); auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp index caf03787..640f870a 100644 --- a/test/correctness/tester.hpp +++ b/test/correctness/tester.hpp @@ -22,13 +22,13 @@ #include <vector> #include <memory> +#include "utilities/utilities.hpp" #include "test/test_utilities.hpp" // The libraries #ifdef CLBLAST_REF_CLBLAS #include <clBLAS.h> #endif -#include "clblast.h" namespace clblast { // ================================================================================================= diff --git a/test/performance/client.hpp b/test/performance/client.hpp index 2ba09cb9..0b6176c8 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -32,7 +32,7 @@ #include <clBLAS.h> #endif #include "test/wrapper_cuda.hpp" -#include "clblast.h" +#include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp index 868a79ed..d74807c9 100644 --- a/test/routines/level1/xamax.hpp +++ b/test/routines/level1/xamax.hpp @@ -69,13 +69,21 @@ class TestXamax { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Amax<T>(args.n, - buffers.scalar(), args.imax_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Amax<T>(args.n, + buffers.scalar(), args.imax_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Amax<T>(args.n, + buffers.scalar(), args.imax_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp index 6add9c64..573f1223 100644 --- a/test/routines/level1/xasum.hpp +++ b/test/routines/level1/xasum.hpp @@ -69,13 +69,21 @@ class TestXasum { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; - auto status = Asum<T>(args.n, - buffers.scalar(), args.asum_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + auto status = Asum<T>(args.n, + buffers.scalar(), args.asum_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Asum<T>(args.n, + buffers.scalar(), args.asum_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp index 17cae6ad..7491a9e8 100644 --- a/test/routines/level1/xaxpy.hpp +++ b/test/routines/level1/xaxpy.hpp @@ -70,13 +70,21 @@ class TestXaxpy { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Axpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Axpy(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Axpy(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp index 7a5c99b8..58abdbf4 100644 --- a/test/routines/level1/xcopy.hpp +++ b/test/routines/level1/xcopy.hpp @@ -69,13 +69,21 @@ class TestXcopy { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Copy<T>(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Copy<T>(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Copy<T>(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp index 1ea25994..229d18c9 100644 --- a/test/routines/level1/xdot.hpp +++ b/test/routines/level1/xdot.hpp @@ -73,14 +73,23 @@ class TestXdot { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Dot<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Dot<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Dot<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp index c800c1f5..9a1dc33a 100644 --- a/test/routines/level1/xdotc.hpp +++ b/test/routines/level1/xdotc.hpp @@ -73,14 +73,23 @@ class TestXdotc { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Dotc<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Dotc<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Dotc<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp index 3545a3a6..4b2c7647 100644 --- a/test/routines/level1/xdotu.hpp +++ b/test/routines/level1/xdotu.hpp @@ -73,14 +73,23 @@ class TestXdotu { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Dotu<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Dotu<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Dotu<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp index 1db70537..f3a789b5 100644 --- a/test/routines/level1/xnrm2.hpp +++ b/test/routines/level1/xnrm2.hpp @@ -69,13 +69,21 @@ class TestXnrm2 { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Nrm2<T>(args.n, - buffers.scalar(), args.nrm2_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Nrm2<T>(args.n, + buffers.scalar(), args.nrm2_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Nrm2<T>(args.n, + buffers.scalar(), args.nrm2_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp index efa0988d..95038032 100644 --- a/test/routines/level1/xscal.hpp +++ b/test/routines/level1/xscal.hpp @@ -66,12 +66,19 @@ class TestXscal { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Scal(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Scal(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Scal(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp index d778cc23..58310698 100644 --- a/test/routines/level1/xswap.hpp +++ b/test/routines/level1/xswap.hpp @@ -69,13 +69,21 @@ class TestXswap { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Swap<T>(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Swap<T>(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Swap<T>(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp index 23138c77..7c198e5d 100644 --- a/test/routines/level2/xgbmv.hpp +++ b/test/routines/level2/xgbmv.hpp @@ -81,15 +81,25 @@ class TestXgbmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Gbmv(args.layout, args.a_transpose, - args.m, args.n, args.kl, args.ku, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gbmv(args.layout, args.a_transpose, + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Gbmv(args.layout, args.a_transpose, + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp index 0ee53b80..780e2976 100644 --- a/test/routines/level2/xgemv.hpp +++ b/test/routines/level2/xgemv.hpp @@ -81,15 +81,25 @@ class TestXgemv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Gemv(args.layout, args.a_transpose, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gemv(args.layout, args.a_transpose, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Gemv(args.layout, args.a_transpose, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp index 92a1a2ae..9c5e2e40 100644 --- a/test/routines/level2/xger.hpp +++ b/test/routines/level2/xger.hpp @@ -77,15 +77,25 @@ class TestXger { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Ger(args.layout, - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Ger(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Ger(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp index 5d899398..5f58b65d 100644 --- a/test/routines/level2/xgerc.hpp +++ b/test/routines/level2/xgerc.hpp @@ -77,15 +77,25 @@ class TestXgerc { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Gerc(args.layout, - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gerc(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Gerc(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp index 96dab22e..fea3932c 100644 --- a/test/routines/level2/xgeru.hpp +++ b/test/routines/level2/xgeru.hpp @@ -77,15 +77,25 @@ class TestXgeru { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Geru(args.layout, - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Geru(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Geru(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp index b6844744..0ccd69b7 100644 --- a/test/routines/level2/xhbmv.hpp +++ b/test/routines/level2/xhbmv.hpp @@ -75,15 +75,25 @@ class TestXhbmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hbmv(args.layout, args.triangle, - args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp index e1f23592..053bc2dc 100644 --- a/test/routines/level2/xhemv.hpp +++ b/test/routines/level2/xhemv.hpp @@ -75,15 +75,25 @@ class TestXhemv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hemv(args.layout, args.triangle, - args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hemv(args.layout, args.triangle, + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hemv(args.layout, args.triangle, + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp index 1ac1247b..745df43f 100644 --- a/test/routines/level2/xher.hpp +++ b/test/routines/level2/xher.hpp @@ -71,14 +71,23 @@ class TestXher { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Her(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Her(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Her(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp index 18ccc1ac..794e9a1e 100644 --- a/test/routines/level2/xher2.hpp +++ b/test/routines/level2/xher2.hpp @@ -75,15 +75,25 @@ class TestXher2 { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Her2(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Her2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Her2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp index ad91fe15..157272d3 100644 --- a/test/routines/level2/xhpmv.hpp +++ b/test/routines/level2/xhpmv.hpp @@ -75,15 +75,25 @@ class TestXhpmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hpmv(args.layout, args.triangle, - args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hpmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hpmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp index f9d580cd..a3bc60d1 100644 --- a/test/routines/level2/xhpr.hpp +++ b/test/routines/level2/xhpr.hpp @@ -71,14 +71,23 @@ class TestXhpr { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hpr(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hpr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hpr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp index f946ba5c..1aa6cc54 100644 --- a/test/routines/level2/xhpr2.hpp +++ b/test/routines/level2/xhpr2.hpp @@ -75,15 +75,25 @@ class TestXhpr2 { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hpr2(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hpr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hpr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp index 6481d19b..51d6441e 100644 --- a/test/routines/level2/xsbmv.hpp +++ b/test/routines/level2/xsbmv.hpp @@ -75,15 +75,25 @@ class TestXsbmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Sbmv(args.layout, args.triangle, - args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Sbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Sbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp index 9815dbee..f3089836 100644 --- a/test/routines/level2/xspmv.hpp +++ b/test/routines/level2/xspmv.hpp @@ -75,15 +75,25 @@ class TestXspmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Spmv(args.layout, args.triangle, - args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Spmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Spmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp index 01a50c38..d76de610 100644 --- a/test/routines/level2/xspr.hpp +++ b/test/routines/level2/xspr.hpp @@ -71,14 +71,23 @@ class TestXspr { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Spr(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Spr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Spr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp index 55f8a141..5ce82a52 100644 --- a/test/routines/level2/xspr2.hpp +++ b/test/routines/level2/xspr2.hpp @@ -75,15 +75,25 @@ class TestXspr2 { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Spr2(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Spr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Spr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp index aec0dfb0..2a70756d 100644 --- a/test/routines/level2/xsymv.hpp +++ b/test/routines/level2/xsymv.hpp @@ -75,15 +75,25 @@ class TestXsymv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Symv(args.layout, args.triangle, - args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Symv(args.layout, args.triangle, + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Symv(args.layout, args.triangle, + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp index 78b686d8..02aad990 100644 --- a/test/routines/level2/xsyr.hpp +++ b/test/routines/level2/xsyr.hpp @@ -71,14 +71,23 @@ class TestXsyr { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Syr(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Syr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp index 38aa4f43..492a9d2d 100644 --- a/test/routines/level2/xsyr2.hpp +++ b/test/routines/level2/xsyr2.hpp @@ -75,15 +75,25 @@ class TestXsyr2 { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Syr2(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Syr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp index 8c7aa381..587676ca 100644 --- a/test/routines/level2/xtbmv.hpp +++ b/test/routines/level2/xtbmv.hpp @@ -70,14 +70,23 @@ class TestXtbmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, - args.n, args.kl, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp index 3afab978..02f334a2 100644 --- a/test/routines/level2/xtpmv.hpp +++ b/test/routines/level2/xtpmv.hpp @@ -70,14 +70,23 @@ class TestXtpmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, - args.n, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp index 2b71f151..4f2dd582 100644 --- a/test/routines/level2/xtrmv.hpp +++ b/test/routines/level2/xtrmv.hpp @@ -70,14 +70,23 @@ class TestXtrmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, - args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp index 85b50e85..aec8eace 100644 --- a/test/routines/level2/xtrsv.hpp +++ b/test/routines/level2/xtrsv.hpp @@ -85,14 +85,23 @@ class TestXtrsv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, - args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp index 1c430c1c..8444c1c3 100644 --- a/test/routines/level3/xgemm.hpp +++ b/test/routines/level3/xgemm.hpp @@ -90,15 +90,25 @@ class TestXgemm { {{"XGEMM_MIN_INDIRECT_SIZE", switch_threshold}}); if (override_status != StatusCode::kSuccess) { return override_status; } } - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, - args.m, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, + args.m, args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, + args.m, args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp index a89617b5..3b70d3f1 100644 --- a/test/routines/level3/xhemm.hpp +++ b/test/routines/level3/xhemm.hpp @@ -83,15 +83,25 @@ class TestXhemm { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hemm(args.layout, args.side, args.triangle, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hemm(args.layout, args.side, args.triangle, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hemm(args.layout, args.side, args.triangle, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp index 55e6d894..6c4e12f1 100644 --- a/test/routines/level3/xher2k.hpp +++ b/test/routines/level3/xher2k.hpp @@ -81,16 +81,26 @@ class TestXher2k { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; - auto status = Her2k(args.layout, args.triangle, args.a_transpose, - args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Her2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, alpha2, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Her2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, alpha2, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp index 3e1e7e02..c1bb7a0b 100644 --- a/test/routines/level3/xherk.hpp +++ b/test/routines/level3/xherk.hpp @@ -74,14 +74,23 @@ class TestXherk { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Herk(args.layout, args.triangle, args.a_transpose, - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Herk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Herk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp index 5d840d40..90cc1888 100644 --- a/test/routines/level3/xsymm.hpp +++ b/test/routines/level3/xsymm.hpp @@ -83,15 +83,25 @@ class TestXsymm { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Symm(args.layout, args.side, args.triangle, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Symm(args.layout, args.side, args.triangle, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Symm(args.layout, args.side, args.triangle, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp index 4a4a2f10..6b29aff7 100644 --- a/test/routines/level3/xsyr2k.hpp +++ b/test/routines/level3/xsyr2k.hpp @@ -81,15 +81,25 @@ class TestXsyr2k { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Syr2k(args.layout, args.triangle, args.a_transpose, - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syr2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Syr2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp index 90e46727..b7782176 100644 --- a/test/routines/level3/xsyrk.hpp +++ b/test/routines/level3/xsyrk.hpp @@ -74,14 +74,23 @@ class TestXsyrk { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Syrk(args.layout, args.triangle, args.a_transpose, - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp index acc00e01..62d0f573 100644 --- a/test/routines/level3/xtrmm.hpp +++ b/test/routines/level3/xtrmm.hpp @@ -74,14 +74,23 @@ class TestXtrmm { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp index d63c9d79..9ce1f09c 100644 --- a/test/routines/level3/xtrsm.hpp +++ b/test/routines/level3/xtrsm.hpp @@ -85,14 +85,23 @@ class TestXtrsm { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp index 4a8fc564..e9715f4e 100644 --- a/test/routines/levelx/xaxpybatched.hpp +++ b/test/routines/levelx/xaxpybatched.hpp @@ -83,14 +83,23 @@ class TestXaxpyBatched { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = AxpyBatched(args.n, args.alphas.data(), - buffers.x_vec(), args.x_offsets.data(), args.x_inc, - buffers.y_vec(), args.y_offsets.data(), args.y_inc, - args.batch_count, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = AxpyBatched(args.n, args.alphas.data(), + buffers.x_vec(), args.x_offsets.data(), args.x_inc, + buffers.y_vec(), args.y_offsets.data(), args.y_inc, + args.batch_count, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = AxpyBatched(args.n, args.alphas.data(), + buffers.x_vec(), args.x_offsets.data(), args.x_inc, + buffers.y_vec(), args.y_offsets.data(), args.y_inc, + args.batch_count, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/levelx/xgemmbatched.hpp b/test/routines/levelx/xgemmbatched.hpp index 704d0578..2a8bd9d4 100644 --- a/test/routines/levelx/xgemmbatched.hpp +++ b/test/routines/levelx/xgemmbatched.hpp @@ -108,8 +108,6 @@ class TestXgemmBatched { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; // Relaxed requirement on ld_a and ld_b within the library, this is here to match clBLAS auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); @@ -119,14 +117,27 @@ class TestXgemmBatched { auto b_one = (!b_rotated) ? args.k : args.n; if (args.a_ld < a_one) { return StatusCode::kInvalidLeadDimA; } if (args.b_ld < b_one) { return StatusCode::kInvalidLeadDimB; } - auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose, - args.m, args.n, args.k, args.alphas.data(), - buffers.a_mat(), args.a_offsets.data(), args.a_ld, - buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(), - buffers.c_mat(), args.c_offsets.data(), args.c_ld, - args.batch_count, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose, + args.m, args.n, args.k, args.alphas.data(), + buffers.a_mat(), args.a_offsets.data(), args.a_ld, + buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(), + buffers.c_mat(), args.c_offsets.data(), args.c_ld, + args.batch_count, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose, + args.m, args.n, args.k, args.alphas.data(), + buffers.a_mat(), args.a_offsets.data(), args.a_ld, + buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(), + buffers.c_mat(), args.c_offsets.data(), args.c_ld, + args.batch_count, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/levelx/xim2col.hpp b/test/routines/levelx/xim2col.hpp index 4124190f..ebffe85e 100644 --- a/test/routines/levelx/xim2col.hpp +++ b/test/routines/levelx/xim2col.hpp @@ -84,17 +84,29 @@ public: // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Im2col<T>(args.channels, args.height, args.width, - args.kernel_h, args.kernel_w, - args.pad_h, args.pad_w, - args.stride_h, args.stride_w, - args.dilation_h, args.dilation_w, - buffers.a_mat(), args.a_offset, - buffers.b_mat(), args.b_offset, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Im2col<T>(args.channels, args.height, args.width, + args.kernel_h, args.kernel_w, + args.pad_h, args.pad_w, + args.stride_h, args.stride_w, + args.dilation_h, args.dilation_w, + buffers.a_mat(), args.a_offset, + buffers.b_mat(), args.b_offset, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Im2col<T>(args.channels, args.height, args.width, + args.kernel_h, args.kernel_w, + args.pad_h, args.pad_w, + args.stride_h, args.stride_w, + args.dilation_h, args.dilation_w, + buffers.a_mat(), args.a_offset, + buffers.b_mat(), args.b_offset, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp index cc02a88b..3df1e2b0 100644 --- a/test/routines/levelx/xinvert.hpp +++ b/test/routines/levelx/xinvert.hpp @@ -164,14 +164,23 @@ class TestXinvert { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { try { - auto event = cl_event{}; - auto inverter = Xinvert<T>(queue, &event); - inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal, - args.n, args.m, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.b_mat); - clWaitForEvents(1, &event); - clReleaseEvent(event); + #ifdef OPENCL_API + auto event = cl_event{}; + auto inverter = Xinvert<T>(queue, &event); + inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal, + args.n, args.m, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat); + clWaitForEvents(1, &event); + clReleaseEvent(event); + #elif CUDA_API + auto inverter = Xinvert<T>(queue, nullptr); + inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal, + args.n, args.m, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat); + cuStreamSynchronize(queue()); + #endif } catch (...) { return DispatchException(); } return StatusCode::kSuccess; } diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp index 2736cf75..70bda452 100644 --- a/test/routines/levelx/xomatcopy.hpp +++ b/test/routines/levelx/xomatcopy.hpp @@ -126,14 +126,23 @@ class TestXomatcopy { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Omatcopy<T>(args.layout, args.a_transpose, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Omatcopy<T>(args.layout, args.a_transpose, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Omatcopy<T>(args.layout, args.a_transpose, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp index 579eb61c..84f8894f 100644 --- a/test/test_utilities.cpp +++ b/test/test_utilities.cpp @@ -88,27 +88,29 @@ void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& sour } // As above, but now for OpenCL data-types instead of std::vectors -Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, RawCommandQueue queue_raw) { - const auto size = source.GetSize() / sizeof(half); - auto queue = Queue(queue_raw); - auto context = queue.GetContext(); - auto source_cpu = std::vector<half>(size); - source.Read(queue, size, source_cpu); - auto result_cpu = HalfToFloatBuffer(source_cpu); - auto result = Buffer<float>(context, size); - result.Write(queue, size, result_cpu); - return result; -} -void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, RawCommandQueue queue_raw) { - const auto size = source.GetSize() / sizeof(float); - auto queue = Queue(queue_raw); - auto context = queue.GetContext(); - auto source_cpu = std::vector<float>(size); - source.Read(queue, size, source_cpu); - auto result_cpu = std::vector<half>(size); - FloatToHalfBuffer(result_cpu, source_cpu); - result.Write(queue, size, result_cpu); -} +#ifdef OPENCL_API + Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, RawCommandQueue queue_raw) { + const auto size = source.GetSize() / sizeof(half); + auto queue = Queue(queue_raw); + auto context = queue.GetContext(); + auto source_cpu = std::vector<half>(size); + source.Read(queue, size, source_cpu); + auto result_cpu = HalfToFloatBuffer(source_cpu); + auto result = Buffer<float>(context, size); + result.Write(queue, size, result_cpu); + return result; + } + void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, RawCommandQueue queue_raw) { + const auto size = source.GetSize() / sizeof(float); + auto queue = Queue(queue_raw); + auto context = queue.GetContext(); + auto source_cpu = std::vector<float>(size); + source.Read(queue, size, source_cpu); + auto result_cpu = std::vector<half>(size); + FloatToHalfBuffer(result_cpu, source_cpu); + result.Write(queue, size, result_cpu); + } +#endif // ================================================================================================= } // namespace clblast diff --git a/test/test_utilities.hpp b/test/test_utilities.hpp index fe7a9cd2..d03c55fc 100644 --- a/test/test_utilities.hpp +++ b/test/test_utilities.hpp @@ -89,8 +89,25 @@ std::vector<float> HalfToFloatBuffer(const std::vector<half>& source); void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source); // As above, but now for OpenCL data-types instead of std::vectors -Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, RawCommandQueue queue_raw); -void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, RawCommandQueue queue_raw); +#ifdef OPENCL_API + Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, RawCommandQueue queue_raw); + void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, RawCommandQueue queue_raw); +#endif + +// ================================================================================================= + +// Creates a buffer but don't test for validity. That's the reason this is not using the clpp11.h or +// cupp11.h interface. +template <typename T> +Buffer<T> CreateInvalidBuffer(const Context& context, const size_t size) { + #ifdef OPENCL_API + auto raw_buffer = clCreateBuffer(context(), CL_MEM_READ_WRITE, size * sizeof(T), nullptr, nullptr); + #elif CUDA_API + CUdeviceptr raw_buffer; + cuMemAlloc(&raw_buffer, size * sizeof(T)); + #endif + return Buffer<T>(raw_buffer); +} // ================================================================================================= } // namespace clblast diff --git a/test/wrapper_cuda.hpp b/test/wrapper_cuda.hpp index c97ae3ef..12417cdd 100644 --- a/test/wrapper_cuda.hpp +++ b/test/wrapper_cuda.hpp @@ -22,6 +22,7 @@ #include "utilities/utilities.hpp" #ifdef CLBLAST_REF_CUBLAS + #define CUDA_NO_HALF #include <cuda_runtime.h> #include <cublas_v2.h> #endif |