diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-28 17:32:37 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-28 17:32:37 +0200 |
commit | 12b08ae49154379f7471a40809ace6418857b387 (patch) | |
tree | ef958197db0bb8a67c9a5840f828b3f6c72bd8fc /test | |
parent | 2949e156f5bfdd724987e67477da3e3608e4aaf9 (diff) | |
parent | fa6e5e67f585b77d34c3031c176de9a0f7904aa9 (diff) |
Merge branch 'master' into android_support
Diffstat (limited to 'test')
57 files changed, 925 insertions, 458 deletions
diff --git a/test/correctness/misc/override_parameters.cpp b/test/correctness/misc/override_parameters.cpp index 535d9286..05f40f57 100644 --- a/test/correctness/misc/override_parameters.cpp +++ b/test/correctness/misc/override_parameters.cpp @@ -28,7 +28,7 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st auto arguments = RetrieveCommandLineArguments(argc, argv); auto errors = size_t{0}; auto passed = size_t{0}; - auto example_routine = TestXgemm<T>(); + auto example_routine = TestXgemm<0, T>(); constexpr auto kSeed = 42; // fixed seed for reproducibility // Determines the test settings @@ -37,6 +37,7 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st const auto valid_settings = std::vector<std::unordered_map<std::string,size_t>>{ { {"KWG",16}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} }, { {"KWG",32}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",32}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} }, + { {"KWG",16}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} }, }; const auto invalid_settings = std::vector<std::unordered_map<std::string,size_t>>{ { {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0} }, diff --git a/test/correctness/routines/level3/xgemm.cpp b/test/correctness/routines/level3/xgemm.cpp index 5de73554..351e538b 100644 --- a/test/correctness/routines/level3/xgemm.cpp +++ b/test/correctness/routines/level3/xgemm.cpp @@ -15,11 +15,16 @@ // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { auto errors = size_t{0}; - errors += clblast::RunTests<clblast::TestXgemm<float>, float, float>(argc, argv, false, "SGEMM"); - errors += clblast::RunTests<clblast::TestXgemm<double>, double, double>(argc, argv, true, "DGEMM"); - errors += clblast::RunTests<clblast::TestXgemm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM"); - errors += clblast::RunTests<clblast::TestXgemm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM"); - errors += clblast::RunTests<clblast::TestXgemm<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<1, float>, float, float>(argc, argv, false, "SGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<1, double>, double, double>(argc, argv, true, "DGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<1, clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<1, clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<1, clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<2, float>, float, float>(argc, argv, true, "SGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<2, double>, double, double>(argc, argv, true, "DGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<2, clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<2, clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM"); + errors += clblast::RunTests<clblast::TestXgemm<2, clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index 659131c5..aa4b4785 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -241,36 +241,22 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st std::cout << std::flush; } - // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly + // Creates the buffers. Note: we are not using the cxpp11.h C++ version since we explicitly // want to be able to create invalid buffers (no error checking here). - auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); - auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); - auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); - auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); - auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); - auto x_vec1 = Buffer<T>(x1); - auto y_vec1 = Buffer<T>(y1); - auto a_mat1 = Buffer<T>(a1); - auto b_mat1 = Buffer<T>(b1); - auto c_mat1 = Buffer<T>(c1); - auto ap_mat1 = Buffer<T>(ap1); - auto scalar1 = Buffer<T>(d1); - auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); - auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); - auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); - auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); - auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); - auto x_vec2 = Buffer<T>(x2); - auto y_vec2 = Buffer<T>(y2); - auto a_mat2 = Buffer<T>(a2); - auto b_mat2 = Buffer<T>(b2); - auto c_mat2 = Buffer<T>(c2); - auto ap_mat2 = Buffer<T>(ap2); - auto scalar2 = Buffer<T>(d2); + auto x_vec1 = CreateInvalidBuffer<T>(context_, args.x_size); + auto y_vec1 = CreateInvalidBuffer<T>(context_, args.y_size); + auto a_mat1 = CreateInvalidBuffer<T>(context_, args.a_size); + auto b_mat1 = CreateInvalidBuffer<T>(context_, args.b_size); + auto c_mat1 = CreateInvalidBuffer<T>(context_, args.c_size); + auto ap_mat1 = CreateInvalidBuffer<T>(context_, args.ap_size); + auto scalar1 = CreateInvalidBuffer<T>(context_, args.scalar_size); + auto x_vec2 = CreateInvalidBuffer<T>(context_, args.x_size); + auto y_vec2 = CreateInvalidBuffer<T>(context_, args.y_size); + auto a_mat2 = CreateInvalidBuffer<T>(context_, args.a_size); + auto b_mat2 = CreateInvalidBuffer<T>(context_, args.b_size); + auto c_mat2 = CreateInvalidBuffer<T>(context_, args.c_size); + auto ap_mat2 = CreateInvalidBuffer<T>(context_, args.ap_size); + auto scalar2 = CreateInvalidBuffer<T>(context_, args.scalar_size); auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp index caf03787..640f870a 100644 --- a/test/correctness/tester.hpp +++ b/test/correctness/tester.hpp @@ -22,13 +22,13 @@ #include <vector> #include <memory> +#include "utilities/utilities.hpp" #include "test/test_utilities.hpp" // The libraries #ifdef CLBLAST_REF_CLBLAS #include <clBLAS.h> #endif -#include "clblast.h" namespace clblast { // ================================================================================================= diff --git a/test/diagnostics.cpp b/test/diagnostics.cpp index 99b936f8..af56cd30 100644 --- a/test/diagnostics.cpp +++ b/test/diagnostics.cpp @@ -85,7 +85,7 @@ void OpenCLDiagnostics(int argc, char *argv[]) { printf("* device.Name() %.4lf ms\n", TimeFunction(kNumRuns, [&](){device.Name();} )); printf("* device.Vendor() %.4lf ms\n", TimeFunction(kNumRuns, [&](){device.Vendor();} )); printf("* device.Version() %.4lf ms\n", TimeFunction(kNumRuns, [&](){device.Version();} )); - printf("* device.Platform() %.4lf ms\n", TimeFunction(kNumRuns, [&](){device.Platform();} )); + printf("* device.Platform() %.4lf ms\n", TimeFunction(kNumRuns, [&](){ device.PlatformID();} )); printf("* Buffer<float>(context, 1024) %.4lf ms\n", TimeFunction(kNumRuns, [&](){Buffer<float>(context, 1024);} )); printf("\n"); diff --git a/test/performance/client.hpp b/test/performance/client.hpp index 2ba09cb9..0b6176c8 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -32,7 +32,7 @@ #include <clBLAS.h> #endif #include "test/wrapper_cuda.hpp" -#include "clblast.h" +#include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= diff --git a/test/performance/routines/level3/xgemm.cpp b/test/performance/routines/level3/xgemm.cpp index 5b3426f5..0b67b4d3 100644 --- a/test/performance/routines/level3/xgemm.cpp +++ b/test/performance/routines/level3/xgemm.cpp @@ -17,15 +17,15 @@ int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { case clblast::Precision::kHalf: - clblast::RunClient<clblast::TestXgemm<clblast::half>, clblast::half, clblast::half>(argc, argv); break; + clblast::RunClient<clblast::TestXgemm<0, clblast::half>, clblast::half, clblast::half>(argc, argv); break; case clblast::Precision::kSingle: - clblast::RunClient<clblast::TestXgemm<float>, float, float>(argc, argv); break; + clblast::RunClient<clblast::TestXgemm<0, float>, float, float>(argc, argv); break; case clblast::Precision::kDouble: - clblast::RunClient<clblast::TestXgemm<double>, double, double>(argc, argv); break; + clblast::RunClient<clblast::TestXgemm<0, double>, double, double>(argc, argv); break; case clblast::Precision::kComplexSingle: - clblast::RunClient<clblast::TestXgemm<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; + clblast::RunClient<clblast::TestXgemm<0, clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break; case clblast::Precision::kComplexDouble: - clblast::RunClient<clblast::TestXgemm<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; + clblast::RunClient<clblast::TestXgemm<0, clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break; } return 0; } diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp index 868a79ed..d74807c9 100644 --- a/test/routines/level1/xamax.hpp +++ b/test/routines/level1/xamax.hpp @@ -69,13 +69,21 @@ class TestXamax { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Amax<T>(args.n, - buffers.scalar(), args.imax_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Amax<T>(args.n, + buffers.scalar(), args.imax_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Amax<T>(args.n, + buffers.scalar(), args.imax_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp index 6add9c64..573f1223 100644 --- a/test/routines/level1/xasum.hpp +++ b/test/routines/level1/xasum.hpp @@ -69,13 +69,21 @@ class TestXasum { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + #ifdef OPENCL_API auto queue_plain = queue(); auto event = cl_event{}; - auto status = Asum<T>(args.n, - buffers.scalar(), args.asum_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + auto status = Asum<T>(args.n, + buffers.scalar(), args.asum_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Asum<T>(args.n, + buffers.scalar(), args.asum_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp index 17cae6ad..7491a9e8 100644 --- a/test/routines/level1/xaxpy.hpp +++ b/test/routines/level1/xaxpy.hpp @@ -70,13 +70,21 @@ class TestXaxpy { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Axpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Axpy(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Axpy(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp index 7a5c99b8..58abdbf4 100644 --- a/test/routines/level1/xcopy.hpp +++ b/test/routines/level1/xcopy.hpp @@ -69,13 +69,21 @@ class TestXcopy { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Copy<T>(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Copy<T>(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Copy<T>(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp index 1ea25994..229d18c9 100644 --- a/test/routines/level1/xdot.hpp +++ b/test/routines/level1/xdot.hpp @@ -73,14 +73,23 @@ class TestXdot { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Dot<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Dot<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Dot<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp index c800c1f5..9a1dc33a 100644 --- a/test/routines/level1/xdotc.hpp +++ b/test/routines/level1/xdotc.hpp @@ -73,14 +73,23 @@ class TestXdotc { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Dotc<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Dotc<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Dotc<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp index 3545a3a6..4b2c7647 100644 --- a/test/routines/level1/xdotu.hpp +++ b/test/routines/level1/xdotu.hpp @@ -73,14 +73,23 @@ class TestXdotu { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Dotu<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Dotu<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Dotu<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp index 1db70537..f3a789b5 100644 --- a/test/routines/level1/xnrm2.hpp +++ b/test/routines/level1/xnrm2.hpp @@ -69,13 +69,21 @@ class TestXnrm2 { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Nrm2<T>(args.n, - buffers.scalar(), args.nrm2_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Nrm2<T>(args.n, + buffers.scalar(), args.nrm2_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Nrm2<T>(args.n, + buffers.scalar(), args.nrm2_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp index efa0988d..95038032 100644 --- a/test/routines/level1/xscal.hpp +++ b/test/routines/level1/xscal.hpp @@ -66,12 +66,19 @@ class TestXscal { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Scal(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Scal(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Scal(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp index d778cc23..58310698 100644 --- a/test/routines/level1/xswap.hpp +++ b/test/routines/level1/xswap.hpp @@ -69,13 +69,21 @@ class TestXswap { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Swap<T>(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Swap<T>(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Swap<T>(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp index 23138c77..7c198e5d 100644 --- a/test/routines/level2/xgbmv.hpp +++ b/test/routines/level2/xgbmv.hpp @@ -81,15 +81,25 @@ class TestXgbmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Gbmv(args.layout, args.a_transpose, - args.m, args.n, args.kl, args.ku, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gbmv(args.layout, args.a_transpose, + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Gbmv(args.layout, args.a_transpose, + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp index 0ee53b80..780e2976 100644 --- a/test/routines/level2/xgemv.hpp +++ b/test/routines/level2/xgemv.hpp @@ -81,15 +81,25 @@ class TestXgemv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Gemv(args.layout, args.a_transpose, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gemv(args.layout, args.a_transpose, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Gemv(args.layout, args.a_transpose, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp index 92a1a2ae..9c5e2e40 100644 --- a/test/routines/level2/xger.hpp +++ b/test/routines/level2/xger.hpp @@ -77,15 +77,25 @@ class TestXger { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Ger(args.layout, - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Ger(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Ger(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp index 5d899398..5f58b65d 100644 --- a/test/routines/level2/xgerc.hpp +++ b/test/routines/level2/xgerc.hpp @@ -77,15 +77,25 @@ class TestXgerc { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Gerc(args.layout, - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gerc(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Gerc(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp index 96dab22e..fea3932c 100644 --- a/test/routines/level2/xgeru.hpp +++ b/test/routines/level2/xgeru.hpp @@ -77,15 +77,25 @@ class TestXgeru { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Geru(args.layout, - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Geru(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Geru(args.layout, + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp index b6844744..0ccd69b7 100644 --- a/test/routines/level2/xhbmv.hpp +++ b/test/routines/level2/xhbmv.hpp @@ -75,15 +75,25 @@ class TestXhbmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hbmv(args.layout, args.triangle, - args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp index e1f23592..053bc2dc 100644 --- a/test/routines/level2/xhemv.hpp +++ b/test/routines/level2/xhemv.hpp @@ -75,15 +75,25 @@ class TestXhemv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hemv(args.layout, args.triangle, - args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hemv(args.layout, args.triangle, + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hemv(args.layout, args.triangle, + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp index 1ac1247b..745df43f 100644 --- a/test/routines/level2/xher.hpp +++ b/test/routines/level2/xher.hpp @@ -71,14 +71,23 @@ class TestXher { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Her(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Her(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Her(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp index 18ccc1ac..794e9a1e 100644 --- a/test/routines/level2/xher2.hpp +++ b/test/routines/level2/xher2.hpp @@ -75,15 +75,25 @@ class TestXher2 { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Her2(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Her2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Her2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp index ad91fe15..157272d3 100644 --- a/test/routines/level2/xhpmv.hpp +++ b/test/routines/level2/xhpmv.hpp @@ -75,15 +75,25 @@ class TestXhpmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hpmv(args.layout, args.triangle, - args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hpmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hpmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp index f9d580cd..a3bc60d1 100644 --- a/test/routines/level2/xhpr.hpp +++ b/test/routines/level2/xhpr.hpp @@ -71,14 +71,23 @@ class TestXhpr { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hpr(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hpr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hpr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp index f946ba5c..1aa6cc54 100644 --- a/test/routines/level2/xhpr2.hpp +++ b/test/routines/level2/xhpr2.hpp @@ -75,15 +75,25 @@ class TestXhpr2 { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hpr2(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hpr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hpr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp index 6481d19b..51d6441e 100644 --- a/test/routines/level2/xsbmv.hpp +++ b/test/routines/level2/xsbmv.hpp @@ -75,15 +75,25 @@ class TestXsbmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Sbmv(args.layout, args.triangle, - args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Sbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Sbmv(args.layout, args.triangle, + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp index 9815dbee..f3089836 100644 --- a/test/routines/level2/xspmv.hpp +++ b/test/routines/level2/xspmv.hpp @@ -75,15 +75,25 @@ class TestXspmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Spmv(args.layout, args.triangle, - args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Spmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Spmv(args.layout, args.triangle, + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp index 01a50c38..d76de610 100644 --- a/test/routines/level2/xspr.hpp +++ b/test/routines/level2/xspr.hpp @@ -71,14 +71,23 @@ class TestXspr { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Spr(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Spr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Spr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp index 55f8a141..5ce82a52 100644 --- a/test/routines/level2/xspr2.hpp +++ b/test/routines/level2/xspr2.hpp @@ -75,15 +75,25 @@ class TestXspr2 { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Spr2(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Spr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Spr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp index aec0dfb0..2a70756d 100644 --- a/test/routines/level2/xsymv.hpp +++ b/test/routines/level2/xsymv.hpp @@ -75,15 +75,25 @@ class TestXsymv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Symv(args.layout, args.triangle, - args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Symv(args.layout, args.triangle, + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Symv(args.layout, args.triangle, + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp index 78b686d8..02aad990 100644 --- a/test/routines/level2/xsyr.hpp +++ b/test/routines/level2/xsyr.hpp @@ -71,14 +71,23 @@ class TestXsyr { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Syr(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Syr(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp index 38aa4f43..492a9d2d 100644 --- a/test/routines/level2/xsyr2.hpp +++ b/test/routines/level2/xsyr2.hpp @@ -75,15 +75,25 @@ class TestXsyr2 { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Syr2(args.layout, args.triangle, - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Syr2(args.layout, args.triangle, + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp index 8c7aa381..587676ca 100644 --- a/test/routines/level2/xtbmv.hpp +++ b/test/routines/level2/xtbmv.hpp @@ -70,14 +70,23 @@ class TestXtbmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, - args.n, args.kl, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp index 3afab978..02f334a2 100644 --- a/test/routines/level2/xtpmv.hpp +++ b/test/routines/level2/xtpmv.hpp @@ -70,14 +70,23 @@ class TestXtpmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, - args.n, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp index 2b71f151..4f2dd582 100644 --- a/test/routines/level2/xtrmv.hpp +++ b/test/routines/level2/xtrmv.hpp @@ -70,14 +70,23 @@ class TestXtrmv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, - args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp index 85b50e85..aec8eace 100644 --- a/test/routines/level2/xtrsv.hpp +++ b/test/routines/level2/xtrsv.hpp @@ -85,14 +85,23 @@ class TestXtrsv { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, - args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp index 7e0ead6d..8444c1c3 100644 --- a/test/routines/level3/xgemm.hpp +++ b/test/routines/level3/xgemm.hpp @@ -22,7 +22,7 @@ namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class -template <typename T> +template <int V, typename T> // 'V' is the version of the kernel (0 for default, 1 for 'in-direct', 2 for 'direct') class TestXgemm { public: @@ -83,15 +83,32 @@ class TestXgemm { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, - args.m, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + if (V != 0) { + const auto device = queue.GetDevice(); + const auto switch_threshold = (V == 1) ? size_t{0} : size_t{1024 * 1024 * 1024}; // large enough for tests + const auto override_status = OverrideParameters(device(), "KernelSelection", PrecisionValue<T>(), + {{"XGEMM_MIN_INDIRECT_SIZE", switch_threshold}}); + if (override_status != StatusCode::kSuccess) { return override_status; } + } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, + args.m, args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, + args.m, args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp index a89617b5..3b70d3f1 100644 --- a/test/routines/level3/xhemm.hpp +++ b/test/routines/level3/xhemm.hpp @@ -83,15 +83,25 @@ class TestXhemm { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Hemm(args.layout, args.side, args.triangle, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Hemm(args.layout, args.side, args.triangle, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Hemm(args.layout, args.side, args.triangle, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp index 55e6d894..6c4e12f1 100644 --- a/test/routines/level3/xher2k.hpp +++ b/test/routines/level3/xher2k.hpp @@ -81,16 +81,26 @@ class TestXher2k { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; - auto status = Her2k(args.layout, args.triangle, args.a_transpose, - args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Her2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, alpha2, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Her2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, alpha2, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp index 3e1e7e02..c1bb7a0b 100644 --- a/test/routines/level3/xherk.hpp +++ b/test/routines/level3/xherk.hpp @@ -74,14 +74,23 @@ class TestXherk { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Herk(args.layout, args.triangle, args.a_transpose, - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Herk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Herk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp index 5d840d40..90cc1888 100644 --- a/test/routines/level3/xsymm.hpp +++ b/test/routines/level3/xsymm.hpp @@ -83,15 +83,25 @@ class TestXsymm { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Symm(args.layout, args.side, args.triangle, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Symm(args.layout, args.side, args.triangle, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Symm(args.layout, args.side, args.triangle, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp index 4a4a2f10..6b29aff7 100644 --- a/test/routines/level3/xsyr2k.hpp +++ b/test/routines/level3/xsyr2k.hpp @@ -81,15 +81,25 @@ class TestXsyr2k { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Syr2k(args.layout, args.triangle, args.a_transpose, - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syr2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Syr2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp index 90e46727..b7782176 100644 --- a/test/routines/level3/xsyrk.hpp +++ b/test/routines/level3/xsyrk.hpp @@ -74,14 +74,23 @@ class TestXsyrk { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Syrk(args.layout, args.triangle, args.a_transpose, - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp index acc00e01..62d0f573 100644 --- a/test/routines/level3/xtrmm.hpp +++ b/test/routines/level3/xtrmm.hpp @@ -74,14 +74,23 @@ class TestXtrmm { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp index d63c9d79..9ce1f09c 100644 --- a/test/routines/level3/xtrsm.hpp +++ b/test/routines/level3/xtrsm.hpp @@ -85,14 +85,23 @@ class TestXtrsm { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp index 4a8fc564..e9715f4e 100644 --- a/test/routines/levelx/xaxpybatched.hpp +++ b/test/routines/levelx/xaxpybatched.hpp @@ -83,14 +83,23 @@ class TestXaxpyBatched { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = AxpyBatched(args.n, args.alphas.data(), - buffers.x_vec(), args.x_offsets.data(), args.x_inc, - buffers.y_vec(), args.y_offsets.data(), args.y_inc, - args.batch_count, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = AxpyBatched(args.n, args.alphas.data(), + buffers.x_vec(), args.x_offsets.data(), args.x_inc, + buffers.y_vec(), args.y_offsets.data(), args.y_inc, + args.batch_count, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = AxpyBatched(args.n, args.alphas.data(), + buffers.x_vec(), args.x_offsets.data(), args.x_inc, + buffers.y_vec(), args.y_offsets.data(), args.y_inc, + args.batch_count, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/levelx/xgemmbatched.hpp b/test/routines/levelx/xgemmbatched.hpp index 704d0578..2a8bd9d4 100644 --- a/test/routines/levelx/xgemmbatched.hpp +++ b/test/routines/levelx/xgemmbatched.hpp @@ -108,8 +108,6 @@ class TestXgemmBatched { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; // Relaxed requirement on ld_a and ld_b within the library, this is here to match clBLAS auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); @@ -119,14 +117,27 @@ class TestXgemmBatched { auto b_one = (!b_rotated) ? args.k : args.n; if (args.a_ld < a_one) { return StatusCode::kInvalidLeadDimA; } if (args.b_ld < b_one) { return StatusCode::kInvalidLeadDimB; } - auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose, - args.m, args.n, args.k, args.alphas.data(), - buffers.a_mat(), args.a_offsets.data(), args.a_ld, - buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(), - buffers.c_mat(), args.c_offsets.data(), args.c_ld, - args.batch_count, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose, + args.m, args.n, args.k, args.alphas.data(), + buffers.a_mat(), args.a_offsets.data(), args.a_ld, + buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(), + buffers.c_mat(), args.c_offsets.data(), args.c_ld, + args.batch_count, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose, + args.m, args.n, args.k, args.alphas.data(), + buffers.a_mat(), args.a_offsets.data(), args.a_ld, + buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(), + buffers.c_mat(), args.c_offsets.data(), args.c_ld, + args.batch_count, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/levelx/xim2col.hpp b/test/routines/levelx/xim2col.hpp index 4124190f..ebffe85e 100644 --- a/test/routines/levelx/xim2col.hpp +++ b/test/routines/levelx/xim2col.hpp @@ -84,17 +84,29 @@ public: // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Im2col<T>(args.channels, args.height, args.width, - args.kernel_h, args.kernel_w, - args.pad_h, args.pad_w, - args.stride_h, args.stride_w, - args.dilation_h, args.dilation_w, - buffers.a_mat(), args.a_offset, - buffers.b_mat(), args.b_offset, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Im2col<T>(args.channels, args.height, args.width, + args.kernel_h, args.kernel_w, + args.pad_h, args.pad_w, + args.stride_h, args.stride_w, + args.dilation_h, args.dilation_w, + buffers.a_mat(), args.a_offset, + buffers.b_mat(), args.b_offset, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Im2col<T>(args.channels, args.height, args.width, + args.kernel_h, args.kernel_w, + args.pad_h, args.pad_w, + args.stride_h, args.stride_w, + args.dilation_h, args.dilation_w, + buffers.a_mat(), args.a_offset, + buffers.b_mat(), args.b_offset, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp index cc02a88b..b8503029 100644 --- a/test/routines/levelx/xinvert.hpp +++ b/test/routines/levelx/xinvert.hpp @@ -17,6 +17,7 @@ #define CLBLAST_TEST_ROUTINES_XINVERT_H_ #include "test/routines/common.hpp" +#include "src/routines/levelx/xinvert.hpp" namespace clblast { // ================================================================================================= @@ -40,6 +41,13 @@ StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) return StatusCode::kUnknownError; } + // Start at zero + for (size_t i =0; i < args.m; ++i) { + for (size_t j = 0; j < args.n; ++j) { + buffers_host.b_mat[j * args.m + i] = T{0.0}; + } + } + // Loops over the amount of diagonal blocks of size args.m by args.m each for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) { const auto a_offset = block_id * (block_size + a_ld * block_size) + args.a_offset; @@ -164,14 +172,23 @@ class TestXinvert { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { try { - auto event = cl_event{}; - auto inverter = Xinvert<T>(queue, &event); - inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal, - args.n, args.m, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.b_mat); - clWaitForEvents(1, &event); - clReleaseEvent(event); + #ifdef OPENCL_API + auto event = cl_event{}; + auto inverter = Xinvert<T>(queue, &event); + inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal, + args.n, args.m, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat); + clWaitForEvents(1, &event); + clReleaseEvent(event); + #elif CUDA_API + auto inverter = Xinvert<T>(queue, nullptr); + inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal, + args.n, args.m, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat); + cuStreamSynchronize(queue()); + #endif } catch (...) { return DispatchException(); } return StatusCode::kSuccess; } diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp index 2736cf75..70bda452 100644 --- a/test/routines/levelx/xomatcopy.hpp +++ b/test/routines/levelx/xomatcopy.hpp @@ -126,14 +126,23 @@ class TestXomatcopy { // Describes how to run the CLBlast routine static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = Omatcopy<T>(args.layout, args.a_transpose, - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - &queue_plain, &event); - if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #ifdef OPENCL_API + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Omatcopy<T>(args.layout, args.a_transpose, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + #elif CUDA_API + auto status = Omatcopy<T>(args.layout, args.a_transpose, + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + queue.GetContext()(), queue.GetDevice()()); + cuStreamSynchronize(queue()); + #endif return status; } diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp index b8fd94a9..84f8894f 100644 --- a/test/test_utilities.cpp +++ b/test/test_utilities.cpp @@ -88,27 +88,29 @@ void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& sour } // As above, but now for OpenCL data-types instead of std::vectors -Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw) { - const auto size = source.GetSize() / sizeof(half); - auto queue = Queue(queue_raw); - auto context = queue.GetContext(); - auto source_cpu = std::vector<half>(size); - source.Read(queue, size, source_cpu); - auto result_cpu = HalfToFloatBuffer(source_cpu); - auto result = Buffer<float>(context, size); - result.Write(queue, size, result_cpu); - return result; -} -void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw) { - const auto size = source.GetSize() / sizeof(float); - auto queue = Queue(queue_raw); - auto context = queue.GetContext(); - auto source_cpu = std::vector<float>(size); - source.Read(queue, size, source_cpu); - auto result_cpu = std::vector<half>(size); - FloatToHalfBuffer(result_cpu, source_cpu); - result.Write(queue, size, result_cpu); -} +#ifdef OPENCL_API + Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, RawCommandQueue queue_raw) { + const auto size = source.GetSize() / sizeof(half); + auto queue = Queue(queue_raw); + auto context = queue.GetContext(); + auto source_cpu = std::vector<half>(size); + source.Read(queue, size, source_cpu); + auto result_cpu = HalfToFloatBuffer(source_cpu); + auto result = Buffer<float>(context, size); + result.Write(queue, size, result_cpu); + return result; + } + void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, RawCommandQueue queue_raw) { + const auto size = source.GetSize() / sizeof(float); + auto queue = Queue(queue_raw); + auto context = queue.GetContext(); + auto source_cpu = std::vector<float>(size); + source.Read(queue, size, source_cpu); + auto result_cpu = std::vector<half>(size); + FloatToHalfBuffer(result_cpu, source_cpu); + result.Write(queue, size, result_cpu); + } +#endif // ================================================================================================= } // namespace clblast diff --git a/test/test_utilities.hpp b/test/test_utilities.hpp index fc50a754..d03c55fc 100644 --- a/test/test_utilities.hpp +++ b/test/test_utilities.hpp @@ -89,8 +89,25 @@ std::vector<float> HalfToFloatBuffer(const std::vector<half>& source); void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source); // As above, but now for OpenCL data-types instead of std::vectors -Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw); -void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw); +#ifdef OPENCL_API + Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, RawCommandQueue queue_raw); + void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, RawCommandQueue queue_raw); +#endif + +// ================================================================================================= + +// Creates a buffer but don't test for validity. That's the reason this is not using the clpp11.h or +// cupp11.h interface. +template <typename T> +Buffer<T> CreateInvalidBuffer(const Context& context, const size_t size) { + #ifdef OPENCL_API + auto raw_buffer = clCreateBuffer(context(), CL_MEM_READ_WRITE, size * sizeof(T), nullptr, nullptr); + #elif CUDA_API + CUdeviceptr raw_buffer; + cuMemAlloc(&raw_buffer, size * sizeof(T)); + #endif + return Buffer<T>(raw_buffer); +} // ================================================================================================= } // namespace clblast diff --git a/test/wrapper_cuda.hpp b/test/wrapper_cuda.hpp index c97ae3ef..12417cdd 100644 --- a/test/wrapper_cuda.hpp +++ b/test/wrapper_cuda.hpp @@ -22,6 +22,7 @@ #include "utilities/utilities.hpp" #ifdef CLBLAST_REF_CUBLAS + #define CUDA_NO_HALF #include <cuda_runtime.h> #include <cublas_v2.h> #endif |