summaryrefslogtreecommitdiff
path: root/test
diff options
context:
space:
mode:
Diffstat (limited to 'test')
-rw-r--r--test/correctness/routines/level3/xgemm.cpp5
-rw-r--r--test/correctness/testblas.cpp44
-rw-r--r--test/correctness/tester.hpp2
-rw-r--r--test/performance/client.hpp2
-rw-r--r--test/routines/level1/xamax.hpp22
-rw-r--r--test/routines/level1/xasum.hpp18
-rw-r--r--test/routines/level1/xaxpy.hpp22
-rw-r--r--test/routines/level1/xcopy.hpp22
-rw-r--r--test/routines/level1/xdot.hpp25
-rw-r--r--test/routines/level1/xdotc.hpp25
-rw-r--r--test/routines/level1/xdotu.hpp25
-rw-r--r--test/routines/level1/xnrm2.hpp22
-rw-r--r--test/routines/level1/xscal.hpp19
-rw-r--r--test/routines/level1/xswap.hpp22
-rw-r--r--test/routines/level2/xgbmv.hpp28
-rw-r--r--test/routines/level2/xgemv.hpp28
-rw-r--r--test/routines/level2/xger.hpp28
-rw-r--r--test/routines/level2/xgerc.hpp28
-rw-r--r--test/routines/level2/xgeru.hpp28
-rw-r--r--test/routines/level2/xhbmv.hpp28
-rw-r--r--test/routines/level2/xhemv.hpp28
-rw-r--r--test/routines/level2/xher.hpp25
-rw-r--r--test/routines/level2/xher2.hpp28
-rw-r--r--test/routines/level2/xhpmv.hpp28
-rw-r--r--test/routines/level2/xhpr.hpp25
-rw-r--r--test/routines/level2/xhpr2.hpp28
-rw-r--r--test/routines/level2/xsbmv.hpp28
-rw-r--r--test/routines/level2/xspmv.hpp28
-rw-r--r--test/routines/level2/xspr.hpp25
-rw-r--r--test/routines/level2/xspr2.hpp28
-rw-r--r--test/routines/level2/xsymv.hpp28
-rw-r--r--test/routines/level2/xsyr.hpp25
-rw-r--r--test/routines/level2/xsyr2.hpp28
-rw-r--r--test/routines/level2/xtbmv.hpp25
-rw-r--r--test/routines/level2/xtpmv.hpp25
-rw-r--r--test/routines/level2/xtrmv.hpp25
-rw-r--r--test/routines/level2/xtrsv.hpp25
-rw-r--r--test/routines/level3/xgemm.hpp28
-rw-r--r--test/routines/level3/xhemm.hpp28
-rw-r--r--test/routines/level3/xher2k.hpp28
-rw-r--r--test/routines/level3/xherk.hpp25
-rw-r--r--test/routines/level3/xsymm.hpp28
-rw-r--r--test/routines/level3/xsyr2k.hpp28
-rw-r--r--test/routines/level3/xsyrk.hpp25
-rw-r--r--test/routines/level3/xtrmm.hpp25
-rw-r--r--test/routines/level3/xtrsm.hpp25
-rw-r--r--test/routines/levelx/xaxpybatched.hpp25
-rw-r--r--test/routines/levelx/xgemmbatched.hpp31
-rw-r--r--test/routines/levelx/xim2col.hpp34
-rw-r--r--test/routines/levelx/xinvert.hpp25
-rw-r--r--test/routines/levelx/xomatcopy.hpp25
-rw-r--r--test/test_utilities.cpp44
-rw-r--r--test/test_utilities.hpp21
-rw-r--r--test/wrapper_cuda.hpp1
54 files changed, 891 insertions, 450 deletions
diff --git a/test/correctness/routines/level3/xgemm.cpp b/test/correctness/routines/level3/xgemm.cpp
index bdf57b36..351e538b 100644
--- a/test/correctness/routines/level3/xgemm.cpp
+++ b/test/correctness/routines/level3/xgemm.cpp
@@ -15,21 +15,16 @@
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
auto errors = size_t{0};
-
- // Tests GEMM based on the 'in-direct' kernel
errors += clblast::RunTests<clblast::TestXgemm<1, float>, float, float>(argc, argv, false, "SGEMM");
errors += clblast::RunTests<clblast::TestXgemm<1, double>, double, double>(argc, argv, true, "DGEMM");
errors += clblast::RunTests<clblast::TestXgemm<1, clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM");
errors += clblast::RunTests<clblast::TestXgemm<1, clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM");
errors += clblast::RunTests<clblast::TestXgemm<1, clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMM");
-
- // Tests GEMM based on the 'direct' kernel
errors += clblast::RunTests<clblast::TestXgemm<2, float>, float, float>(argc, argv, true, "SGEMM");
errors += clblast::RunTests<clblast::TestXgemm<2, double>, double, double>(argc, argv, true, "DGEMM");
errors += clblast::RunTests<clblast::TestXgemm<2, clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CGEMM");
errors += clblast::RunTests<clblast::TestXgemm<2, clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZGEMM");
errors += clblast::RunTests<clblast::TestXgemm<2, clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HGEMM");
-
if (errors > 0) { return 1; } else { return 0; }
}
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 659131c5..aa4b4785 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -241,36 +241,22 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
std::cout << std::flush;
}
- // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
+ // Creates the buffers. Note: we are not using the cxpp11.h C++ version since we explicitly
// want to be able to create invalid buffers (no error checking here).
- auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
- auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
- auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
- auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
- auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
- auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
- auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr);
- auto x_vec1 = Buffer<T>(x1);
- auto y_vec1 = Buffer<T>(y1);
- auto a_mat1 = Buffer<T>(a1);
- auto b_mat1 = Buffer<T>(b1);
- auto c_mat1 = Buffer<T>(c1);
- auto ap_mat1 = Buffer<T>(ap1);
- auto scalar1 = Buffer<T>(d1);
- auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
- auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
- auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
- auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
- auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
- auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
- auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr);
- auto x_vec2 = Buffer<T>(x2);
- auto y_vec2 = Buffer<T>(y2);
- auto a_mat2 = Buffer<T>(a2);
- auto b_mat2 = Buffer<T>(b2);
- auto c_mat2 = Buffer<T>(c2);
- auto ap_mat2 = Buffer<T>(ap2);
- auto scalar2 = Buffer<T>(d2);
+ auto x_vec1 = CreateInvalidBuffer<T>(context_, args.x_size);
+ auto y_vec1 = CreateInvalidBuffer<T>(context_, args.y_size);
+ auto a_mat1 = CreateInvalidBuffer<T>(context_, args.a_size);
+ auto b_mat1 = CreateInvalidBuffer<T>(context_, args.b_size);
+ auto c_mat1 = CreateInvalidBuffer<T>(context_, args.c_size);
+ auto ap_mat1 = CreateInvalidBuffer<T>(context_, args.ap_size);
+ auto scalar1 = CreateInvalidBuffer<T>(context_, args.scalar_size);
+ auto x_vec2 = CreateInvalidBuffer<T>(context_, args.x_size);
+ auto y_vec2 = CreateInvalidBuffer<T>(context_, args.y_size);
+ auto a_mat2 = CreateInvalidBuffer<T>(context_, args.a_size);
+ auto b_mat2 = CreateInvalidBuffer<T>(context_, args.b_size);
+ auto c_mat2 = CreateInvalidBuffer<T>(context_, args.c_size);
+ auto ap_mat2 = CreateInvalidBuffer<T>(context_, args.ap_size);
+ auto scalar2 = CreateInvalidBuffer<T>(context_, args.scalar_size);
auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp
index caf03787..640f870a 100644
--- a/test/correctness/tester.hpp
+++ b/test/correctness/tester.hpp
@@ -22,13 +22,13 @@
#include <vector>
#include <memory>
+#include "utilities/utilities.hpp"
#include "test/test_utilities.hpp"
// The libraries
#ifdef CLBLAST_REF_CLBLAS
#include <clBLAS.h>
#endif
-#include "clblast.h"
namespace clblast {
// =================================================================================================
diff --git a/test/performance/client.hpp b/test/performance/client.hpp
index 2ba09cb9..0b6176c8 100644
--- a/test/performance/client.hpp
+++ b/test/performance/client.hpp
@@ -32,7 +32,7 @@
#include <clBLAS.h>
#endif
#include "test/wrapper_cuda.hpp"
-#include "clblast.h"
+#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp
index 868a79ed..d74807c9 100644
--- a/test/routines/level1/xamax.hpp
+++ b/test/routines/level1/xamax.hpp
@@ -69,13 +69,21 @@ class TestXamax {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Amax<T>(args.n,
- buffers.scalar(), args.imax_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Amax<T>(args.n,
+ buffers.scalar(), args.imax_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Amax<T>(args.n,
+ buffers.scalar(), args.imax_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp
index 6add9c64..573f1223 100644
--- a/test/routines/level1/xasum.hpp
+++ b/test/routines/level1/xasum.hpp
@@ -69,13 +69,21 @@ class TestXasum {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+ #ifdef OPENCL_API
auto queue_plain = queue();
auto event = cl_event{};
- auto status = Asum<T>(args.n,
- buffers.scalar(), args.asum_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ auto status = Asum<T>(args.n,
+ buffers.scalar(), args.asum_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Asum<T>(args.n,
+ buffers.scalar(), args.asum_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp
index 17cae6ad..7491a9e8 100644
--- a/test/routines/level1/xaxpy.hpp
+++ b/test/routines/level1/xaxpy.hpp
@@ -70,13 +70,21 @@ class TestXaxpy {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Axpy(args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Axpy(args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Axpy(args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp
index 7a5c99b8..58abdbf4 100644
--- a/test/routines/level1/xcopy.hpp
+++ b/test/routines/level1/xcopy.hpp
@@ -69,13 +69,21 @@ class TestXcopy {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Copy<T>(args.n,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Copy<T>(args.n,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Copy<T>(args.n,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp
index 1ea25994..229d18c9 100644
--- a/test/routines/level1/xdot.hpp
+++ b/test/routines/level1/xdot.hpp
@@ -73,14 +73,23 @@ class TestXdot {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Dot<T>(args.n,
- buffers.scalar(), args.dot_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Dot<T>(args.n,
+ buffers.scalar(), args.dot_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Dot<T>(args.n,
+ buffers.scalar(), args.dot_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp
index c800c1f5..9a1dc33a 100644
--- a/test/routines/level1/xdotc.hpp
+++ b/test/routines/level1/xdotc.hpp
@@ -73,14 +73,23 @@ class TestXdotc {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Dotc<T>(args.n,
- buffers.scalar(), args.dot_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Dotc<T>(args.n,
+ buffers.scalar(), args.dot_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Dotc<T>(args.n,
+ buffers.scalar(), args.dot_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp
index 3545a3a6..4b2c7647 100644
--- a/test/routines/level1/xdotu.hpp
+++ b/test/routines/level1/xdotu.hpp
@@ -73,14 +73,23 @@ class TestXdotu {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Dotu<T>(args.n,
- buffers.scalar(), args.dot_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Dotu<T>(args.n,
+ buffers.scalar(), args.dot_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Dotu<T>(args.n,
+ buffers.scalar(), args.dot_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp
index 1db70537..f3a789b5 100644
--- a/test/routines/level1/xnrm2.hpp
+++ b/test/routines/level1/xnrm2.hpp
@@ -69,13 +69,21 @@ class TestXnrm2 {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Nrm2<T>(args.n,
- buffers.scalar(), args.nrm2_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Nrm2<T>(args.n,
+ buffers.scalar(), args.nrm2_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Nrm2<T>(args.n,
+ buffers.scalar(), args.nrm2_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp
index efa0988d..95038032 100644
--- a/test/routines/level1/xscal.hpp
+++ b/test/routines/level1/xscal.hpp
@@ -66,12 +66,19 @@ class TestXscal {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Scal(args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Scal(args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Scal(args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp
index d778cc23..58310698 100644
--- a/test/routines/level1/xswap.hpp
+++ b/test/routines/level1/xswap.hpp
@@ -69,13 +69,21 @@ class TestXswap {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Swap<T>(args.n,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Swap<T>(args.n,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Swap<T>(args.n,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp
index 23138c77..7c198e5d 100644
--- a/test/routines/level2/xgbmv.hpp
+++ b/test/routines/level2/xgbmv.hpp
@@ -81,15 +81,25 @@ class TestXgbmv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Gbmv(args.layout, args.a_transpose,
- args.m, args.n, args.kl, args.ku, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Gbmv(args.layout, args.a_transpose,
+ args.m, args.n, args.kl, args.ku, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Gbmv(args.layout, args.a_transpose,
+ args.m, args.n, args.kl, args.ku, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp
index 0ee53b80..780e2976 100644
--- a/test/routines/level2/xgemv.hpp
+++ b/test/routines/level2/xgemv.hpp
@@ -81,15 +81,25 @@ class TestXgemv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Gemv(args.layout, args.a_transpose,
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Gemv(args.layout, args.a_transpose,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Gemv(args.layout, args.a_transpose,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp
index 92a1a2ae..9c5e2e40 100644
--- a/test/routines/level2/xger.hpp
+++ b/test/routines/level2/xger.hpp
@@ -77,15 +77,25 @@ class TestXger {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Ger(args.layout,
- args.m, args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Ger(args.layout,
+ args.m, args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Ger(args.layout,
+ args.m, args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp
index 5d899398..5f58b65d 100644
--- a/test/routines/level2/xgerc.hpp
+++ b/test/routines/level2/xgerc.hpp
@@ -77,15 +77,25 @@ class TestXgerc {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Gerc(args.layout,
- args.m, args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Gerc(args.layout,
+ args.m, args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Gerc(args.layout,
+ args.m, args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp
index 96dab22e..fea3932c 100644
--- a/test/routines/level2/xgeru.hpp
+++ b/test/routines/level2/xgeru.hpp
@@ -77,15 +77,25 @@ class TestXgeru {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Geru(args.layout,
- args.m, args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Geru(args.layout,
+ args.m, args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Geru(args.layout,
+ args.m, args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp
index b6844744..0ccd69b7 100644
--- a/test/routines/level2/xhbmv.hpp
+++ b/test/routines/level2/xhbmv.hpp
@@ -75,15 +75,25 @@ class TestXhbmv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Hbmv(args.layout, args.triangle,
- args.n, args.kl, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Hbmv(args.layout, args.triangle,
+ args.n, args.kl, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Hbmv(args.layout, args.triangle,
+ args.n, args.kl, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp
index e1f23592..053bc2dc 100644
--- a/test/routines/level2/xhemv.hpp
+++ b/test/routines/level2/xhemv.hpp
@@ -75,15 +75,25 @@ class TestXhemv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Hemv(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Hemv(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Hemv(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp
index 1ac1247b..745df43f 100644
--- a/test/routines/level2/xher.hpp
+++ b/test/routines/level2/xher.hpp
@@ -71,14 +71,23 @@ class TestXher {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Her(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Her(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Her(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp
index 18ccc1ac..794e9a1e 100644
--- a/test/routines/level2/xher2.hpp
+++ b/test/routines/level2/xher2.hpp
@@ -75,15 +75,25 @@ class TestXher2 {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Her2(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Her2(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Her2(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp
index ad91fe15..157272d3 100644
--- a/test/routines/level2/xhpmv.hpp
+++ b/test/routines/level2/xhpmv.hpp
@@ -75,15 +75,25 @@ class TestXhpmv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Hpmv(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.ap_mat(), args.ap_offset,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Hpmv(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Hpmv(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp
index f9d580cd..a3bc60d1 100644
--- a/test/routines/level2/xhpr.hpp
+++ b/test/routines/level2/xhpr.hpp
@@ -71,14 +71,23 @@ class TestXhpr {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Hpr(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.ap_mat(), args.ap_offset,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Hpr(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.ap_mat(), args.ap_offset,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Hpr(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.ap_mat(), args.ap_offset,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp
index f946ba5c..1aa6cc54 100644
--- a/test/routines/level2/xhpr2.hpp
+++ b/test/routines/level2/xhpr2.hpp
@@ -75,15 +75,25 @@ class TestXhpr2 {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Hpr2(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.ap_mat(), args.ap_offset,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Hpr2(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.ap_mat(), args.ap_offset,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Hpr2(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.ap_mat(), args.ap_offset,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp
index 6481d19b..51d6441e 100644
--- a/test/routines/level2/xsbmv.hpp
+++ b/test/routines/level2/xsbmv.hpp
@@ -75,15 +75,25 @@ class TestXsbmv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Sbmv(args.layout, args.triangle,
- args.n, args.kl, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Sbmv(args.layout, args.triangle,
+ args.n, args.kl, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Sbmv(args.layout, args.triangle,
+ args.n, args.kl, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp
index 9815dbee..f3089836 100644
--- a/test/routines/level2/xspmv.hpp
+++ b/test/routines/level2/xspmv.hpp
@@ -75,15 +75,25 @@ class TestXspmv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Spmv(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.ap_mat(), args.ap_offset,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Spmv(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Spmv(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp
index 01a50c38..d76de610 100644
--- a/test/routines/level2/xspr.hpp
+++ b/test/routines/level2/xspr.hpp
@@ -71,14 +71,23 @@ class TestXspr {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Spr(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.ap_mat(), args.ap_offset,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Spr(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.ap_mat(), args.ap_offset,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Spr(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.ap_mat(), args.ap_offset,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp
index 55f8a141..5ce82a52 100644
--- a/test/routines/level2/xspr2.hpp
+++ b/test/routines/level2/xspr2.hpp
@@ -75,15 +75,25 @@ class TestXspr2 {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Spr2(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.ap_mat(), args.ap_offset,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Spr2(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.ap_mat(), args.ap_offset,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Spr2(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.ap_mat(), args.ap_offset,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp
index aec0dfb0..2a70756d 100644
--- a/test/routines/level2/xsymv.hpp
+++ b/test/routines/level2/xsymv.hpp
@@ -75,15 +75,25 @@ class TestXsymv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Symv(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
- buffers.y_vec(), args.y_offset, args.y_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Symv(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Symv(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp
index 78b686d8..02aad990 100644
--- a/test/routines/level2/xsyr.hpp
+++ b/test/routines/level2/xsyr.hpp
@@ -71,14 +71,23 @@ class TestXsyr {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Syr(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Syr(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Syr(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp
index 38aa4f43..492a9d2d 100644
--- a/test/routines/level2/xsyr2.hpp
+++ b/test/routines/level2/xsyr2.hpp
@@ -75,15 +75,25 @@ class TestXsyr2 {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Syr2(args.layout, args.triangle,
- args.n, args.alpha,
- buffers.x_vec(), args.x_offset, args.x_inc,
- buffers.y_vec(), args.y_offset, args.y_inc,
- buffers.a_mat(), args.a_offset, args.a_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Syr2(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Syr2(args.layout, args.triangle,
+ args.n, args.alpha,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ buffers.y_vec(), args.y_offset, args.y_inc,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp
index 8c7aa381..587676ca 100644
--- a/test/routines/level2/xtbmv.hpp
+++ b/test/routines/level2/xtbmv.hpp
@@ -70,14 +70,23 @@ class TestXtbmv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
- args.n, args.kl,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n, args.kl,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n, args.kl,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp
index 3afab978..02f334a2 100644
--- a/test/routines/level2/xtpmv.hpp
+++ b/test/routines/level2/xtpmv.hpp
@@ -70,14 +70,23 @@ class TestXtpmv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
- args.n,
- buffers.ap_mat(), args.ap_offset,
- buffers.x_vec(), args.x_offset, args.x_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n,
+ buffers.ap_mat(), args.ap_offset,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp
index 2b71f151..4f2dd582 100644
--- a/test/routines/level2/xtrmv.hpp
+++ b/test/routines/level2/xtrmv.hpp
@@ -70,14 +70,23 @@ class TestXtrmv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
- args.n,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp
index 85b50e85..aec8eace 100644
--- a/test/routines/level2/xtrsv.hpp
+++ b/test/routines/level2/xtrsv.hpp
@@ -85,14 +85,23 @@ class TestXtrsv {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
- args.n,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.x_vec(), args.x_offset, args.x_inc,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+ args.n,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.x_vec(), args.x_offset, args.x_inc,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp
index 1c430c1c..8444c1c3 100644
--- a/test/routines/level3/xgemm.hpp
+++ b/test/routines/level3/xgemm.hpp
@@ -90,15 +90,25 @@ class TestXgemm {
{{"XGEMM_MIN_INDIRECT_SIZE", switch_threshold}});
if (override_status != StatusCode::kSuccess) { return override_status; }
}
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
- args.m, args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
+ args.m, args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
+ args.m, args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp
index a89617b5..3b70d3f1 100644
--- a/test/routines/level3/xhemm.hpp
+++ b/test/routines/level3/xhemm.hpp
@@ -83,15 +83,25 @@ class TestXhemm {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Hemm(args.layout, args.side, args.triangle,
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Hemm(args.layout, args.side, args.triangle,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Hemm(args.layout, args.side, args.triangle,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp
index 55e6d894..6c4e12f1 100644
--- a/test/routines/level3/xher2k.hpp
+++ b/test/routines/level3/xher2k.hpp
@@ -81,16 +81,26 @@ class TestXher2k {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
auto alpha2 = T{args.alpha, args.alpha};
- auto status = Her2k(args.layout, args.triangle, args.a_transpose,
- args.n, args.k, alpha2,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Her2k(args.layout, args.triangle, args.a_transpose,
+ args.n, args.k, alpha2,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Her2k(args.layout, args.triangle, args.a_transpose,
+ args.n, args.k, alpha2,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp
index 3e1e7e02..c1bb7a0b 100644
--- a/test/routines/level3/xherk.hpp
+++ b/test/routines/level3/xherk.hpp
@@ -74,14 +74,23 @@ class TestXherk {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Herk(args.layout, args.triangle, args.a_transpose,
- args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Herk(args.layout, args.triangle, args.a_transpose,
+ args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Herk(args.layout, args.triangle, args.a_transpose,
+ args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp
index 5d840d40..90cc1888 100644
--- a/test/routines/level3/xsymm.hpp
+++ b/test/routines/level3/xsymm.hpp
@@ -83,15 +83,25 @@ class TestXsymm {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Symm(args.layout, args.side, args.triangle,
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Symm(args.layout, args.side, args.triangle,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Symm(args.layout, args.side, args.triangle,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp
index 4a4a2f10..6b29aff7 100644
--- a/test/routines/level3/xsyr2k.hpp
+++ b/test/routines/level3/xsyr2k.hpp
@@ -81,15 +81,25 @@ class TestXsyr2k {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
- args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
+ args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
+ args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp
index 90e46727..b7782176 100644
--- a/test/routines/level3/xsyrk.hpp
+++ b/test/routines/level3/xsyrk.hpp
@@ -74,14 +74,23 @@ class TestXsyrk {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Syrk(args.layout, args.triangle, args.a_transpose,
- args.n, args.k, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
- buffers.c_mat(), args.c_offset, args.c_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Syrk(args.layout, args.triangle, args.a_transpose,
+ args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Syrk(args.layout, args.triangle, args.a_transpose,
+ args.n, args.k, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+ buffers.c_mat(), args.c_offset, args.c_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp
index acc00e01..62d0f573 100644
--- a/test/routines/level3/xtrmm.hpp
+++ b/test/routines/level3/xtrmm.hpp
@@ -74,14 +74,23 @@ class TestXtrmm {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp
index d63c9d79..9ce1f09c 100644
--- a/test/routines/level3/xtrsm.hpp
+++ b/test/routines/level3/xtrsm.hpp
@@ -85,14 +85,23 @@ class TestXtrsm {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp
index 4a8fc564..e9715f4e 100644
--- a/test/routines/levelx/xaxpybatched.hpp
+++ b/test/routines/levelx/xaxpybatched.hpp
@@ -83,14 +83,23 @@ class TestXaxpyBatched {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = AxpyBatched(args.n, args.alphas.data(),
- buffers.x_vec(), args.x_offsets.data(), args.x_inc,
- buffers.y_vec(), args.y_offsets.data(), args.y_inc,
- args.batch_count,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = AxpyBatched(args.n, args.alphas.data(),
+ buffers.x_vec(), args.x_offsets.data(), args.x_inc,
+ buffers.y_vec(), args.y_offsets.data(), args.y_inc,
+ args.batch_count,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = AxpyBatched(args.n, args.alphas.data(),
+ buffers.x_vec(), args.x_offsets.data(), args.x_inc,
+ buffers.y_vec(), args.y_offsets.data(), args.y_inc,
+ args.batch_count,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/levelx/xgemmbatched.hpp b/test/routines/levelx/xgemmbatched.hpp
index 704d0578..2a8bd9d4 100644
--- a/test/routines/levelx/xgemmbatched.hpp
+++ b/test/routines/levelx/xgemmbatched.hpp
@@ -108,8 +108,6 @@ class TestXgemmBatched {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
// Relaxed requirement on ld_a and ld_b within the library, this is here to match clBLAS
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
@@ -119,14 +117,27 @@ class TestXgemmBatched {
auto b_one = (!b_rotated) ? args.k : args.n;
if (args.a_ld < a_one) { return StatusCode::kInvalidLeadDimA; }
if (args.b_ld < b_one) { return StatusCode::kInvalidLeadDimB; }
- auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose,
- args.m, args.n, args.k, args.alphas.data(),
- buffers.a_mat(), args.a_offsets.data(), args.a_ld,
- buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(),
- buffers.c_mat(), args.c_offsets.data(), args.c_ld,
- args.batch_count,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose,
+ args.m, args.n, args.k, args.alphas.data(),
+ buffers.a_mat(), args.a_offsets.data(), args.a_ld,
+ buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(),
+ buffers.c_mat(), args.c_offsets.data(), args.c_ld,
+ args.batch_count,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose,
+ args.m, args.n, args.k, args.alphas.data(),
+ buffers.a_mat(), args.a_offsets.data(), args.a_ld,
+ buffers.b_mat(), args.b_offsets.data(), args.b_ld, args.betas.data(),
+ buffers.c_mat(), args.c_offsets.data(), args.c_ld,
+ args.batch_count,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/levelx/xim2col.hpp b/test/routines/levelx/xim2col.hpp
index 4124190f..ebffe85e 100644
--- a/test/routines/levelx/xim2col.hpp
+++ b/test/routines/levelx/xim2col.hpp
@@ -84,17 +84,29 @@ public:
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Im2col<T>(args.channels, args.height, args.width,
- args.kernel_h, args.kernel_w,
- args.pad_h, args.pad_w,
- args.stride_h, args.stride_w,
- args.dilation_h, args.dilation_w,
- buffers.a_mat(), args.a_offset,
- buffers.b_mat(), args.b_offset,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Im2col<T>(args.channels, args.height, args.width,
+ args.kernel_h, args.kernel_w,
+ args.pad_h, args.pad_w,
+ args.stride_h, args.stride_w,
+ args.dilation_h, args.dilation_w,
+ buffers.a_mat(), args.a_offset,
+ buffers.b_mat(), args.b_offset,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Im2col<T>(args.channels, args.height, args.width,
+ args.kernel_h, args.kernel_w,
+ args.pad_h, args.pad_w,
+ args.stride_h, args.stride_w,
+ args.dilation_h, args.dilation_w,
+ buffers.a_mat(), args.a_offset,
+ buffers.b_mat(), args.b_offset,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp
index cc02a88b..3df1e2b0 100644
--- a/test/routines/levelx/xinvert.hpp
+++ b/test/routines/levelx/xinvert.hpp
@@ -164,14 +164,23 @@ class TestXinvert {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
try {
- auto event = cl_event{};
- auto inverter = Xinvert<T>(queue, &event);
- inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
- args.n, args.m,
- buffers.a_mat, args.a_offset, args.a_ld,
- buffers.b_mat);
- clWaitForEvents(1, &event);
- clReleaseEvent(event);
+ #ifdef OPENCL_API
+ auto event = cl_event{};
+ auto inverter = Xinvert<T>(queue, &event);
+ inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
+ args.n, args.m,
+ buffers.a_mat, args.a_offset, args.a_ld,
+ buffers.b_mat);
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ #elif CUDA_API
+ auto inverter = Xinvert<T>(queue, nullptr);
+ inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
+ args.n, args.m,
+ buffers.a_mat, args.a_offset, args.a_ld,
+ buffers.b_mat);
+ cuStreamSynchronize(queue());
+ #endif
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
}
diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp
index 2736cf75..70bda452 100644
--- a/test/routines/levelx/xomatcopy.hpp
+++ b/test/routines/levelx/xomatcopy.hpp
@@ -126,14 +126,23 @@ class TestXomatcopy {
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
- auto queue_plain = queue();
- auto event = cl_event{};
- auto status = Omatcopy<T>(args.layout, args.a_transpose,
- args.m, args.n, args.alpha,
- buffers.a_mat(), args.a_offset, args.a_ld,
- buffers.b_mat(), args.b_offset, args.b_ld,
- &queue_plain, &event);
- if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #ifdef OPENCL_API
+ auto queue_plain = queue();
+ auto event = cl_event{};
+ auto status = Omatcopy<T>(args.layout, args.a_transpose,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld,
+ &queue_plain, &event);
+ if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+ #elif CUDA_API
+ auto status = Omatcopy<T>(args.layout, args.a_transpose,
+ args.m, args.n, args.alpha,
+ buffers.a_mat(), args.a_offset, args.a_ld,
+ buffers.b_mat(), args.b_offset, args.b_ld,
+ queue.GetContext()(), queue.GetDevice()());
+ cuStreamSynchronize(queue());
+ #endif
return status;
}
diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp
index 579eb61c..84f8894f 100644
--- a/test/test_utilities.cpp
+++ b/test/test_utilities.cpp
@@ -88,27 +88,29 @@ void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& sour
}
// As above, but now for OpenCL data-types instead of std::vectors
-Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, RawCommandQueue queue_raw) {
- const auto size = source.GetSize() / sizeof(half);
- auto queue = Queue(queue_raw);
- auto context = queue.GetContext();
- auto source_cpu = std::vector<half>(size);
- source.Read(queue, size, source_cpu);
- auto result_cpu = HalfToFloatBuffer(source_cpu);
- auto result = Buffer<float>(context, size);
- result.Write(queue, size, result_cpu);
- return result;
-}
-void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, RawCommandQueue queue_raw) {
- const auto size = source.GetSize() / sizeof(float);
- auto queue = Queue(queue_raw);
- auto context = queue.GetContext();
- auto source_cpu = std::vector<float>(size);
- source.Read(queue, size, source_cpu);
- auto result_cpu = std::vector<half>(size);
- FloatToHalfBuffer(result_cpu, source_cpu);
- result.Write(queue, size, result_cpu);
-}
+#ifdef OPENCL_API
+ Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, RawCommandQueue queue_raw) {
+ const auto size = source.GetSize() / sizeof(half);
+ auto queue = Queue(queue_raw);
+ auto context = queue.GetContext();
+ auto source_cpu = std::vector<half>(size);
+ source.Read(queue, size, source_cpu);
+ auto result_cpu = HalfToFloatBuffer(source_cpu);
+ auto result = Buffer<float>(context, size);
+ result.Write(queue, size, result_cpu);
+ return result;
+ }
+ void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, RawCommandQueue queue_raw) {
+ const auto size = source.GetSize() / sizeof(float);
+ auto queue = Queue(queue_raw);
+ auto context = queue.GetContext();
+ auto source_cpu = std::vector<float>(size);
+ source.Read(queue, size, source_cpu);
+ auto result_cpu = std::vector<half>(size);
+ FloatToHalfBuffer(result_cpu, source_cpu);
+ result.Write(queue, size, result_cpu);
+ }
+#endif
// =================================================================================================
} // namespace clblast
diff --git a/test/test_utilities.hpp b/test/test_utilities.hpp
index fe7a9cd2..d03c55fc 100644
--- a/test/test_utilities.hpp
+++ b/test/test_utilities.hpp
@@ -89,8 +89,25 @@ std::vector<float> HalfToFloatBuffer(const std::vector<half>& source);
void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source);
// As above, but now for OpenCL data-types instead of std::vectors
-Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, RawCommandQueue queue_raw);
-void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, RawCommandQueue queue_raw);
+#ifdef OPENCL_API
+ Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, RawCommandQueue queue_raw);
+ void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, RawCommandQueue queue_raw);
+#endif
+
+// =================================================================================================
+
+// Creates a buffer but don't test for validity. That's the reason this is not using the clpp11.h or
+// cupp11.h interface.
+template <typename T>
+Buffer<T> CreateInvalidBuffer(const Context& context, const size_t size) {
+ #ifdef OPENCL_API
+ auto raw_buffer = clCreateBuffer(context(), CL_MEM_READ_WRITE, size * sizeof(T), nullptr, nullptr);
+ #elif CUDA_API
+ CUdeviceptr raw_buffer;
+ cuMemAlloc(&raw_buffer, size * sizeof(T));
+ #endif
+ return Buffer<T>(raw_buffer);
+}
// =================================================================================================
} // namespace clblast
diff --git a/test/wrapper_cuda.hpp b/test/wrapper_cuda.hpp
index c97ae3ef..12417cdd 100644
--- a/test/wrapper_cuda.hpp
+++ b/test/wrapper_cuda.hpp
@@ -22,6 +22,7 @@
#include "utilities/utilities.hpp"
#ifdef CLBLAST_REF_CUBLAS
+ #define CUDA_NO_HALF
#include <cuda_runtime.h>
#include <cublas_v2.h>
#endif