diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-04-02 14:59:39 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-04-02 14:59:39 +0200 |
commit | 5079fbaeffe38cd26ea2fa878bdbb2de5b140bcf (patch) | |
tree | 0f2e85e1e1acef1d22f046499dd0b8a30e5da4f9 | |
parent | a98c00a2671b8981579f3a73dca8fb3365a95e53 (diff) | |
parent | b84d2296b87ac212474af855d916b12adf96bdb7 (diff) |
Merge pull request #143 from CNugteren/test_cblas_timing
CBLAS reference code is now separated from device-host copies
52 files changed, 440 insertions, 512 deletions
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index 3d091b64..0f2661ad 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -353,6 +353,54 @@ void PopulateVector(std::vector<half> &vector, std::mt19937 &mt, std::uniform_re // ================================================================================================= +template <typename T, typename U> +void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host, + Queue &queue, const std::vector<std::string> &names) { + for (auto &name: names) { + if (name == kBufVecX) {buffers_host.x_vec = std::vector<T>(args.x_size, static_cast<T>(0)); buffers.x_vec.Read(queue, args.x_size, buffers_host.x_vec); } + else if (name == kBufVecY) { buffers_host.y_vec = std::vector<T>(args.y_size, static_cast<T>(0)); buffers.y_vec.Read(queue, args.y_size, buffers_host.y_vec); } + else if (name == kBufMatA) { buffers_host.a_mat = std::vector<T>(args.a_size, static_cast<T>(0)); buffers.a_mat.Read(queue, args.a_size, buffers_host.a_mat); } + else if (name == kBufMatB) { buffers_host.b_mat = std::vector<T>(args.b_size, static_cast<T>(0)); buffers.b_mat.Read(queue, args.b_size, buffers_host.b_mat); } + else if (name == kBufMatC) { buffers_host.c_mat = std::vector<T>(args.c_size, static_cast<T>(0)); buffers.c_mat.Read(queue, args.c_size, buffers_host.c_mat); } + else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector<T>(args.ap_size, static_cast<T>(0)); buffers.ap_mat.Read(queue, args.ap_size, buffers_host.ap_mat); } + else if (name == kBufScalar) { buffers_host.scalar = std::vector<T>(args.scalar_size, static_cast<T>(0)); buffers.scalar.Read(queue, args.scalar_size, buffers_host.scalar); } + else { throw std::runtime_error("Invalid buffer name"); } + } +} + +template <typename T, typename U> +void HostToDevice(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host, + Queue &queue, const std::vector<std::string> &names) { + for (auto &name: names) { + if (name == kBufVecX) { buffers.x_vec.Write(queue, args.x_size, buffers_host.x_vec); } + else if (name == kBufVecY) { buffers.y_vec.Write(queue, args.y_size, buffers_host.y_vec); } + else if (name == kBufMatA) { buffers.a_mat.Write(queue, args.a_size, buffers_host.a_mat); } + else if (name == kBufMatB) { buffers.b_mat.Write(queue, args.b_size, buffers_host.b_mat); } + else if (name == kBufMatC) { buffers.c_mat.Write(queue, args.c_size, buffers_host.c_mat); } + else if (name == kBufMatAP) { buffers.ap_mat.Write(queue, args.ap_size, buffers_host.ap_mat); } + else if (name == kBufScalar) { buffers.scalar.Write(queue, args.scalar_size, buffers_host.scalar); } + else { throw std::runtime_error("Invalid buffer name"); } + } +} + +// Compiles the above functions +template void DeviceToHost(const Arguments<half>&, Buffers<half>&, BuffersHost<half>&, Queue&, const std::vector<std::string>&); +template void DeviceToHost(const Arguments<float>&, Buffers<float>&, BuffersHost<float>&, Queue&, const std::vector<std::string>&); +template void DeviceToHost(const Arguments<double>&, Buffers<double>&, BuffersHost<double>&, Queue&, const std::vector<std::string>&); +template void DeviceToHost(const Arguments<float>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&); +template void DeviceToHost(const Arguments<double>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&); +template void DeviceToHost(const Arguments<float2>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&); +template void DeviceToHost(const Arguments<double2>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&); +template void HostToDevice(const Arguments<half>&, Buffers<half>&, BuffersHost<half>&, Queue&, const std::vector<std::string>&); +template void HostToDevice(const Arguments<float>&, Buffers<float>&, BuffersHost<float>&, Queue&, const std::vector<std::string>&); +template void HostToDevice(const Arguments<double>&, Buffers<double>&, BuffersHost<double>&, Queue&, const std::vector<std::string>&); +template void HostToDevice(const Arguments<float>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&); +template void HostToDevice(const Arguments<double>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&); +template void HostToDevice(const Arguments<float2>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&); +template void HostToDevice(const Arguments<double2>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&); + +// ================================================================================================= + // Conversion between half and single-precision std::vector<float> HalfToFloatBuffer(const std::vector<half>& source) { auto result = std::vector<float>(source.size()); diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index b3db8c22..535560a3 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -98,6 +98,15 @@ constexpr auto kArgHelp = "h"; constexpr auto kArgQuiet = "q"; constexpr auto kArgNoAbbreviations = "no_abbrv"; +// The buffer names +constexpr auto kBufVecX = "X"; +constexpr auto kBufVecY = "Y"; +constexpr auto kBufMatA = "A"; +constexpr auto kBufMatB = "B"; +constexpr auto kBufMatC = "C"; +constexpr auto kBufMatAP = "AP"; +constexpr auto kBufScalar = "Scalar"; + // ================================================================================================= // Converts a regular or complex type to it's base type (e.g. float2 to float) @@ -202,6 +211,16 @@ struct Buffers { Buffer<T> ap_mat; Buffer<T> scalar; }; +template <typename T> +struct BuffersHost { + std::vector<T> x_vec; + std::vector<T> y_vec; + std::vector<T> a_mat; + std::vector<T> b_mat; + std::vector<T> c_mat; + std::vector<T> ap_mat; + std::vector<T> scalar; +}; // ================================================================================================= @@ -250,6 +269,18 @@ void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_ // ================================================================================================= +// Copies buffers from the OpenCL device to the host +template <typename T, typename U> +void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host, + Queue &queue, const std::vector<std::string> &names); + +// Copies buffers from the host to the OpenCL device +template <typename T, typename U> +void HostToDevice(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host, + Queue &queue, const std::vector<std::string> &names); + +// ================================================================================================= + // Conversion between half and single-precision std::vector<float> HalfToFloatBuffer(const std::vector<half>& source); void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source); diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index c8c59fcf..1bfcb623 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -67,15 +67,17 @@ TestBlas<T,U>::TestBlas(const std::vector<std::string> &arguments, const bool si kBetaValues(GetExampleScalars<U>(full_test_)), prepare_data_(prepare_data), run_routine_(run_routine), + run_reference1_(run_reference1), + run_reference2_(run_reference2), get_result_(get_result), get_index_(get_index), get_id1_(get_id1), get_id2_(get_id2) { - // Sets the reference to test against - if (compare_clblas_) { run_reference_ = run_reference1; } - else if (compare_cblas_) { run_reference_ = run_reference2; } - else { throw std::runtime_error("Invalid configuration: no reference to test against"); } + // Sanity check + if (!compare_clblas_ && !compare_cblas_) { + throw std::runtime_error("Invalid configuration: no reference to test against"); + } // Computes the maximum sizes. This allows for a single set of input/output buffers. const auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end()); @@ -184,7 +186,9 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st else if (compare_cblas_) { fprintf(stdout, " [CPU BLAS]"); } std::cout << std::flush; } - const auto status1 = run_reference_(args, buffers1, queue_); + auto status1 = StatusCode::kSuccess; + if (compare_clblas_) { status1 = run_reference1_(args, buffers1, queue_); } + else if (compare_cblas_) { status1 = run_reference2_(args, buffers1, queue_); } // Tests for equality of the two status codes if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; } @@ -305,7 +309,9 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st else if (compare_cblas_) { fprintf(stdout, " [CPU BLAS]"); } std::cout << std::flush; } - const auto status1 = run_reference_(args, buffers1, queue_); + auto status1 = StatusCode::kSuccess; + if (compare_clblas_) { status1 = run_reference1_(args, buffers1, queue_); } + else if (compare_cblas_) { status1 = run_reference2_(args, buffers1, queue_); } // Tests for equality of the two status codes if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; } diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp index 8c8db348..560ff4d3 100644 --- a/test/correctness/testblas.hpp +++ b/test/correctness/testblas.hpp @@ -109,33 +109,48 @@ class TestBlas: public Tester<T,U> { std::vector<T> scalar_source_; // The routine-specific functions passed to the tester - DataPrepare prepare_data_; - Routine run_routine_; - Routine run_reference_; - ResultGet get_result_; - ResultIndex get_index_; - ResultIterator get_id1_; - ResultIterator get_id2_; + const DataPrepare prepare_data_; + const Routine run_routine_; + const Routine run_reference1_; + const Routine run_reference2_; + const ResultGet get_result_; + const ResultIndex get_index_; + const ResultIterator get_id1_; + const ResultIterator get_id2_; }; // ================================================================================================= +// Bogus reference function, in case a comparison library is not available +template <typename T, typename U, typename BufferType> +static StatusCode ReferenceNotAvailable(const Arguments<U> &, BufferType &, Queue &) { + return StatusCode::kNotImplemented; +} + // The interface to the correctness tester. This is a separate function in the header such that it // is automatically compiled for each routine, templated by the parameter "C". template <typename C, typename T, typename U> size_t RunTests(int argc, char *argv[], const bool silent, const std::string &name) { auto command_line_args = RetrieveCommandLineArguments(argc, argv); - // Sets the reference to test against - #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS) - const auto reference_routine1 = C::RunReference1; // clBLAS - const auto reference_routine2 = C::RunReference2; // CBLAS - #elif CLBLAST_REF_CLBLAS - const auto reference_routine1 = C::RunReference1; // clBLAS - const auto reference_routine2 = C::RunReference1; // not used, dummy - #elif CLBLAST_REF_CBLAS - const auto reference_routine1 = C::RunReference2; // not used, dummy - const auto reference_routine2 = C::RunReference2; // CBLAS + // Sets the clBLAS reference to test against + #ifdef CLBLAST_REF_CLBLAS + auto reference_routine1 = C::RunReference1; // clBLAS when available + #else + auto reference_routine1 = ReferenceNotAvailable<T,U,Buffers<T>>; + #endif + + // Sets the CBLAS reference to test against + #ifdef CLBLAST_REF_CBLAS + auto reference_routine2 = [](const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) -> StatusCode { + auto buffers_host = BuffersHost<T>(); + DeviceToHost(args, buffers, buffers_host, queue, C::BuffersIn()); + C::RunReference2(args, buffers_host, queue); + HostToDevice(args, buffers, buffers_host, queue, C::BuffersOut()); + return StatusCode::kSuccess; + }; + #else + auto reference_routine2 = ReferenceNotAvailable<T,U,Buffers<T>>; #endif // Non-BLAS routines cannot be fully tested diff --git a/test/performance/client.cpp b/test/performance/client.cpp index aa864c8f..48d6708e 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -29,13 +29,17 @@ template <typename T, typename U> const int Client<T,U>::kSeed = 42; // fixed se // Constructor template <typename T, typename U> Client<T,U>::Client(const Routine run_routine, - const Routine run_reference1, const Routine run_reference2, + const Reference1 run_reference1, const Reference2 run_reference2, const std::vector<std::string> &options, + const std::vector<std::string> &buffers_in, + const std::vector<std::string> &buffers_out, const GetMetric get_flops, const GetMetric get_bytes): run_routine_(run_routine), run_reference1_(run_reference1), run_reference2_(run_reference2), options_(options), + buffers_in_(buffers_in), + buffers_out_(buffers_out), get_flops_(get_flops), get_bytes_(get_bytes) { } @@ -222,7 +226,10 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas)); } if (args.compare_cblas) { - auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS"); + auto buffers_host = BuffersHost<T>(); + DeviceToHost(args, buffers, buffers_host, queue, buffers_in_); + auto ms_cblas = TimedExecution(args.num_runs, args, buffers_host, queue, run_reference2_, "CPU BLAS"); + HostToDevice(args, buffers, buffers_host, queue, buffers_out_); timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas)); } @@ -252,9 +259,10 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) // timing is performed using the milliseconds chrono functions. The function returns the minimum // value found in the vector of timing results. The return value is in milliseconds. template <typename T, typename U> +template <typename BufferType, typename RoutineType> double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args, - Buffers<T> &buffers, Queue &queue, - Routine run_blas, const std::string &library_name) { + BufferType &buffers, Queue &queue, + RoutineType run_blas, const std::string &library_name) { auto status = StatusCode::kSuccess; // Do an optional warm-up to omit compilation times and initialisations from the measurements diff --git a/test/performance/client.hpp b/test/performance/client.hpp index b5cc1465..12fd113d 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -44,12 +44,15 @@ class Client { // Shorthand for the routine-specific functions passed to the tester using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>; + using Reference1 = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>; + using Reference2 = std::function<StatusCode(const Arguments<U>&, BuffersHost<T>&, Queue&)>; using SetMetric = std::function<void(Arguments<U>&)>; using GetMetric = std::function<size_t(const Arguments<U>&)>; // The constructor - Client(const Routine run_routine, const Routine run_reference1, const Routine run_reference2, + Client(const Routine run_routine, const Reference1 run_reference1, const Reference2 run_reference2, const std::vector<std::string> &options, + const std::vector<std::string> &buffers_in, const std::vector<std::string> &buffers_out, const GetMetric get_flops, const GetMetric get_bytes); // Parses all command-line arguments, filling in the arguments structure. If no command-line @@ -66,8 +69,9 @@ class Client { private: // Runs a function a given number of times and returns the execution time of the shortest instance - double TimedExecution(const size_t num_runs, const Arguments<U> &args, Buffers<T> &buffers, - Queue &queue, Routine run_blas, const std::string &library_name); + template <typename BufferType, typename RoutineType> + double TimedExecution(const size_t num_runs, const Arguments<U> &args, BufferType &buffers, + Queue &queue, RoutineType run_blas, const std::string &library_name); // Prints the header of a performance-data table void PrintTableHeader(const Arguments<U>& args); @@ -78,9 +82,11 @@ class Client { // The routine-specific functions passed to the tester const Routine run_routine_; - const Routine run_reference1_; - const Routine run_reference2_; + const Reference1 run_reference1_; + const Reference2 run_reference2_; const std::vector<std::string> options_; + const std::vector<std::string> buffers_in_; + const std::vector<std::string> buffers_out_; const GetMetric get_flops_; const GetMetric get_bytes_; @@ -91,8 +97,8 @@ class Client { // ================================================================================================= // Bogus reference function, in case a comparison library is not available -template <typename T, typename U> -static StatusCode ReferenceNotAvailable(const Arguments<U> &, Buffers<T> &, Queue &) { +template <typename T, typename U, typename BufferType> +static StatusCode ReferenceNotAvailable(const Arguments<U> &, BufferType &, Queue &) { return StatusCode::kNotImplemented; } @@ -105,17 +111,17 @@ void RunClient(int argc, char *argv[]) { #ifdef CLBLAST_REF_CLBLAS auto reference1 = C::RunReference1; // clBLAS when available #else - auto reference1 = ReferenceNotAvailable<T,U>; + auto reference1 = ReferenceNotAvailable<T,U,Buffers<T>>; #endif #ifdef CLBLAST_REF_CBLAS auto reference2 = C::RunReference2; // CBLAS when available #else - auto reference2 = ReferenceNotAvailable<T,U>; + auto reference2 = ReferenceNotAvailable<T,U,BuffersHost<T>>; #endif // Creates a new client auto client = Client<T,U>(C::RunRoutine, reference1, reference2, C::GetOptions(), - C::GetFlops, C::GetBytes); + C::BuffersIn(), C::BuffersOut(), C::GetFlops, C::GetBytes); // Simple command line argument parser with defaults auto args = client.ParseArguments(argc, argv, C::BLASLevel(), diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp index a22f681f..2e844f2c 100644 --- a/test/routines/level1/xamax.hpp +++ b/test/routines/level1/xamax.hpp @@ -43,6 +43,8 @@ class TestXamax { kArgXInc, kArgXOffset, kArgImaxOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalar}; } + static std::vector<std::string> BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -101,15 +103,10 @@ class TestXamax { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXamax(args.n, - scalar_cpu, args.imax_offset, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.imax_offset, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp index 64377189..8488bfc6 100644 --- a/test/routines/level1/xasum.hpp +++ b/test/routines/level1/xasum.hpp @@ -43,6 +43,8 @@ class TestXasum { kArgXInc, kArgXOffset, kArgAsumOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalar}; } + static std::vector<std::string> BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -101,15 +103,10 @@ class TestXasum { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXasum(args.n, - scalar_cpu, args.asum_offset, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.asum_offset, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp index eba067c0..cc7d251a 100644 --- a/test/routines/level1/xaxpy.hpp +++ b/test/routines/level1/xaxpy.hpp @@ -44,6 +44,8 @@ class TestXaxpy { kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -102,15 +104,10 @@ class TestXaxpy { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXaxpy(args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp index 753f0da5..0dbf0f3d 100644 --- a/test/routines/level1/xcopy.hpp +++ b/test/routines/level1/xcopy.hpp @@ -43,6 +43,8 @@ class TestXcopy { kArgXInc, kArgYInc, kArgXOffset, kArgYOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -101,15 +103,10 @@ class TestXcopy { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXcopy(args.n, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp index 8127247d..bdf2e721 100644 --- a/test/routines/level1/xdot.hpp +++ b/test/routines/level1/xdot.hpp @@ -43,6 +43,8 @@ class TestXdot { kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgDotOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; } + static std::vector<std::string> BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -107,18 +109,11 @@ class TestXdot { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXdot(args.n, - scalar_cpu, args.dot_offset, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.dot_offset, + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp index 96d97dc4..2cc71b93 100644 --- a/test/routines/level1/xdotc.hpp +++ b/test/routines/level1/xdotc.hpp @@ -43,6 +43,8 @@ class TestXdotc { kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgDotOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; } + static std::vector<std::string> BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -107,18 +109,11 @@ class TestXdotc { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXdotc(args.n, - scalar_cpu, args.dot_offset, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.dot_offset, + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp index 70c7fceb..272e1e31 100644 --- a/test/routines/level1/xdotu.hpp +++ b/test/routines/level1/xdotu.hpp @@ -43,6 +43,8 @@ class TestXdotu { kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgDotOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; } + static std::vector<std::string> BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -107,18 +109,11 @@ class TestXdotu { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXdotu(args.n, - scalar_cpu, args.dot_offset, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.dot_offset, + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp index ce33fe59..cb1ec683 100644 --- a/test/routines/level1/xnrm2.hpp +++ b/test/routines/level1/xnrm2.hpp @@ -43,6 +43,8 @@ class TestXnrm2 { kArgXInc, kArgXOffset, kArgNrm2Offset}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalar}; } + static std::vector<std::string> BuffersOut() { return {kBufScalar}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -101,15 +103,10 @@ class TestXnrm2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXnrm2(args.n, - scalar_cpu, args.nrm2_offset, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers_host.scalar, args.nrm2_offset, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp index d89688b4..3e6b9a38 100644 --- a/test/routines/level1/xscal.hpp +++ b/test/routines/level1/xscal.hpp @@ -44,6 +44,8 @@ class TestXscal { kArgXOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX}; } + static std::vector<std::string> BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -96,12 +98,9 @@ class TestXscal { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXscal(args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp index 49b0d3d0..d9b84dc4 100644 --- a/test/routines/level1/xswap.hpp +++ b/test/routines/level1/xswap.hpp @@ -43,6 +43,8 @@ class TestXswap { kArgXInc, kArgYInc, kArgXOffset, kArgYOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecX, kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -101,16 +103,10 @@ class TestXswap { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXswap(args.n, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp index f371b9a7..990ef49f 100644 --- a/test/routines/level2/xgbmv.hpp +++ b/test/routines/level2/xgbmv.hpp @@ -45,6 +45,8 @@ class TestXgbmv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -118,20 +120,13 @@ class TestXgbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXgbmv(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), args.m, args.n, args.kl, args.ku, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp index 2442be4c..a007cb62 100644 --- a/test/routines/level2/xgemv.hpp +++ b/test/routines/level2/xgemv.hpp @@ -45,6 +45,8 @@ class TestXgemv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -118,20 +120,13 @@ class TestXgemv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXgemv(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), args.m, args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp index 3e7ccbc3..5c131e2d 100644 --- a/test/routines/level2/xger.hpp +++ b/test/routines/level2/xger.hpp @@ -45,6 +45,8 @@ class TestXger { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -113,19 +115,12 @@ class TestXger { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXger(convertToCBLAS(args.layout), args.m, args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp index d880ae1f..e3544424 100644 --- a/test/routines/level2/xgerc.hpp +++ b/test/routines/level2/xgerc.hpp @@ -45,6 +45,8 @@ class TestXgerc { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -113,19 +115,12 @@ class TestXgerc { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXgerc(convertToCBLAS(args.layout), args.m, args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp index 1735e42a..1d81e292 100644 --- a/test/routines/level2/xgeru.hpp +++ b/test/routines/level2/xgeru.hpp @@ -45,6 +45,8 @@ class TestXgeru { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -113,19 +115,12 @@ class TestXgeru { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXgeru(convertToCBLAS(args.layout), args.m, args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp index 99538bf1..21194fd6 100644 --- a/test/routines/level2/xhbmv.hpp +++ b/test/routines/level2/xhbmv.hpp @@ -45,6 +45,8 @@ class TestXhbmv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -112,20 +114,13 @@ class TestXhbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXhbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.kl, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp index 3792cb66..ffef8ff8 100644 --- a/test/routines/level2/xhemv.hpp +++ b/test/routines/level2/xhemv.hpp @@ -45,6 +45,8 @@ class TestXhemv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -112,20 +114,13 @@ class TestXhemv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXhemv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp index c58eb189..083bd3fc 100644 --- a/test/routines/level2/xher.hpp +++ b/test/routines/level2/xher.hpp @@ -45,6 +45,8 @@ class TestXher { kArgAOffset, kArgXOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; } + static std::vector<std::string> BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<U> &args) { @@ -106,17 +108,12 @@ class TestXher { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<U> &args, BuffersHost<T> &buffers_host, Queue&) { cblasXher(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp index 8a7eb0b6..7bd890a5 100644 --- a/test/routines/level2/xher2.hpp +++ b/test/routines/level2/xher2.hpp @@ -45,6 +45,8 @@ class TestXher2 { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -112,20 +114,13 @@ class TestXher2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXher2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp index 0862b619..285dd6d3 100644 --- a/test/routines/level2/xhpmv.hpp +++ b/test/routines/level2/xhpmv.hpp @@ -45,6 +45,8 @@ class TestXhpmv { kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -112,20 +114,13 @@ class TestXhpmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXhpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - ap_mat_cpu, args.ap_offset, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.ap_mat, args.ap_offset, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp index 5b454174..88bae86b 100644 --- a/test/routines/level2/xhpr.hpp +++ b/test/routines/level2/xhpr.hpp @@ -45,6 +45,8 @@ class TestXhpr { kArgAPOffset, kArgXOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX}; } + static std::vector<std::string> BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<U> &args) { @@ -106,17 +108,12 @@ class TestXhpr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<U> &args, BuffersHost<T> &buffers_host, Queue&) { cblasXhpr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp index b770da2e..cd10fa00 100644 --- a/test/routines/level2/xhpr2.hpp +++ b/test/routines/level2/xhpr2.hpp @@ -45,6 +45,8 @@ class TestXhpr2 { kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -112,20 +114,13 @@ class TestXhpr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXhpr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp index 7a836170..5c70aba5 100644 --- a/test/routines/level2/xsbmv.hpp +++ b/test/routines/level2/xsbmv.hpp @@ -45,6 +45,8 @@ class TestXsbmv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -112,20 +114,13 @@ class TestXsbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXsbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.kl, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp index 352c8cfd..560f5baa 100644 --- a/test/routines/level2/xspmv.hpp +++ b/test/routines/level2/xspmv.hpp @@ -45,6 +45,8 @@ class TestXspmv { kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -112,20 +114,13 @@ class TestXspmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXspmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - ap_mat_cpu, args.ap_offset, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.ap_mat, args.ap_offset, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp index 988bcdc2..2e12db33 100644 --- a/test/routines/level2/xspr.hpp +++ b/test/routines/level2/xspr.hpp @@ -45,6 +45,8 @@ class TestXspr { kArgAPOffset, kArgXOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX}; } + static std::vector<std::string> BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -106,17 +108,12 @@ class TestXspr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXspr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp index ee517bc1..a7e22227 100644 --- a/test/routines/level2/xspr2.hpp +++ b/test/routines/level2/xspr2.hpp @@ -45,6 +45,8 @@ class TestXspr2 { kArgAPOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufMatAP}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -112,20 +114,13 @@ class TestXspr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXspr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.ap_mat, args.ap_offset); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp index 5eecfb74..d9cf9c1e 100644 --- a/test/routines/level2/xsymv.hpp +++ b/test/routines/level2/xsymv.hpp @@ -45,6 +45,8 @@ class TestXsymv { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -112,20 +114,13 @@ class TestXsymv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXsymv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc, args.beta, - y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc, args.beta, + buffers_host.y_vec, args.y_offset, args.y_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp index ac4ee1ff..b60c3a36 100644 --- a/test/routines/level2/xsyr.hpp +++ b/test/routines/level2/xsyr.hpp @@ -45,6 +45,8 @@ class TestXsyr { kArgAOffset, kArgXOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; } + static std::vector<std::string> BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -106,17 +108,12 @@ class TestXsyr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXsyr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp index 43644883..dd10a3d0 100644 --- a/test/routines/level2/xsyr2.hpp +++ b/test/routines/level2/xsyr2.hpp @@ -45,6 +45,8 @@ class TestXsyr2 { kArgAOffset, kArgXOffset, kArgYOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufMatA}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -112,20 +114,13 @@ class TestXsyr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXsyr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, - x_vec_cpu, args.x_offset, args.x_inc, - y_vec_cpu, args.y_offset, args.y_inc, - a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers_host.x_vec, args.x_offset, args.x_inc, + buffers_host.y_vec, args.y_offset, args.y_inc, + buffers_host.a_mat, args.a_offset, args.a_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp index ab9244af..7eb8ce9e 100644 --- a/test/routines/level2/xtbmv.hpp +++ b/test/routines/level2/xtbmv.hpp @@ -44,6 +44,8 @@ class TestXtbmv { kArgALeadDim, kArgXInc, kArgAOffset, kArgXOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; } + static std::vector<std::string> BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -107,19 +109,14 @@ class TestXtbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXtbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.n, args.kl, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp index 3821e1a4..7f4842f0 100644 --- a/test/routines/level2/xtpmv.hpp +++ b/test/routines/level2/xtpmv.hpp @@ -44,6 +44,8 @@ class TestXtpmv { kArgXInc, kArgAPOffset, kArgXOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX}; } + static std::vector<std::string> BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -107,19 +109,14 @@ class TestXtpmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXtpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.n, - ap_mat_cpu, args.ap_offset, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers_host.ap_mat, args.ap_offset, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp index 7211c757..cb7527ed 100644 --- a/test/routines/level2/xtrmv.hpp +++ b/test/routines/level2/xtrmv.hpp @@ -44,6 +44,8 @@ class TestXtrmv { kArgALeadDim, kArgXInc, kArgAOffset, kArgXOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; } + static std::vector<std::string> BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -107,19 +109,14 @@ class TestXtrmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXtrmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.n, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp index 78b9672f..63d34758 100644 --- a/test/routines/level2/xtrsv.hpp +++ b/test/routines/level2/xtrsv.hpp @@ -44,6 +44,8 @@ class TestXtrsv { kArgALeadDim, kArgXInc, kArgAOffset, kArgXOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; } + static std::vector<std::string> BuffersOut() { return {kBufVecX}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &args) { @@ -122,19 +124,14 @@ class TestXtrsv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXtrsv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.n, - a_mat_cpu, args.a_offset, args.a_ld, - x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.x_vec, args.x_offset, args.x_inc); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp index 1b12fb1c..a33cbfec 100644 --- a/test/routines/level3/xgemm.hpp +++ b/test/routines/level3/xgemm.hpp @@ -45,6 +45,8 @@ class TestXgemm { kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector<std::string> BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<T> &args) { @@ -121,21 +123,14 @@ class TestXgemm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXgemm(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), convertToCBLAS(args.b_transpose), args.m, args.n, args.k, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp index 76550b15..74029c7e 100644 --- a/test/routines/level3/xhemm.hpp +++ b/test/routines/level3/xhemm.hpp @@ -45,6 +45,8 @@ class TestXhemm { kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector<std::string> BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<T> &args) { @@ -121,21 +123,14 @@ class TestXhemm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXhemm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), args.m, args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp index 5ca3aac6..ea13bbc1 100644 --- a/test/routines/level3/xher2k.hpp +++ b/test/routines/level3/xher2k.hpp @@ -45,6 +45,8 @@ class TestXher2k { kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector<std::string> BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<U> &args) { @@ -121,22 +123,15 @@ class TestXher2k { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments<U> &args, BuffersHost<T> &buffers_host, Queue&) { auto alpha2 = T{args.alpha, args.alpha}; cblasXher2k(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, alpha2, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp index e93d887a..b1ce83e0 100644 --- a/test/routines/level3/xherk.hpp +++ b/test/routines/level3/xherk.hpp @@ -45,6 +45,8 @@ class TestXherk { kArgAOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatC}; } + static std::vector<std::string> BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<U> &args) { @@ -110,18 +112,13 @@ class TestXherk { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments<U> &args, BuffersHost<T> &buffers_host, Queue&) { cblasXherk(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp index 9d127e26..6ab644b8 100644 --- a/test/routines/level3/xsymm.hpp +++ b/test/routines/level3/xsymm.hpp @@ -45,6 +45,8 @@ class TestXsymm { kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector<std::string> BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<T> &args) { @@ -121,21 +123,14 @@ class TestXsymm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXsymm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), args.m, args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp index d1bdac56..1400c4e2 100644 --- a/test/routines/level3/xsyr2k.hpp +++ b/test/routines/level3/xsyr2k.hpp @@ -45,6 +45,8 @@ class TestXsyr2k { kArgAOffset, kArgBOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector<std::string> BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<T> &args) { @@ -119,21 +121,14 @@ class TestXsyr2k { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXsyr2k(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp index 1330924e..2df8d6b0 100644 --- a/test/routines/level3/xsyrk.hpp +++ b/test/routines/level3/xsyrk.hpp @@ -45,6 +45,8 @@ class TestXsyrk { kArgAOffset, kArgCOffset, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatC}; } + static std::vector<std::string> BuffersOut() { return {kBufMatC}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<T> &args) { @@ -110,18 +112,13 @@ class TestXsyrk { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXsyrk(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, args.beta, - c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, args.beta, + buffers_host.c_mat, args.c_offset, args.c_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp index 7c5bd842..84adc6e0 100644 --- a/test/routines/level3/xtrmm.hpp +++ b/test/routines/level3/xtrmm.hpp @@ -45,6 +45,8 @@ class TestXtrmm { kArgAOffset, kArgBOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; } + static std::vector<std::string> BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<T> &args) { @@ -112,20 +114,15 @@ class TestXtrmm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXtrmm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.m, args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld); - buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp index a70ef03f..de5b307d 100644 --- a/test/routines/level3/xtrsm.hpp +++ b/test/routines/level3/xtrsm.hpp @@ -47,6 +47,8 @@ class TestXtrsm { kArgAOffset, kArgBOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; } + static std::vector<std::string> BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<T> &args) { @@ -124,20 +126,15 @@ class TestXtrsm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { cblasXtrsm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal), args.m, args.n, args.alpha, - a_mat_cpu, args.a_offset, args.a_ld, - b_mat_cpu, args.b_offset, args.b_ld); - buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); + buffers_host.a_mat, args.a_offset, args.a_ld, + buffers_host.b_mat, args.b_offset, args.b_ld); return StatusCode::kSuccess; } #endif diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp index ee15ff92..05141bbb 100644 --- a/test/routines/levelx/xaxpybatched.hpp +++ b/test/routines/levelx/xaxpybatched.hpp @@ -45,6 +45,8 @@ class TestXaxpyBatched { kArgXInc, kArgYInc, kArgBatchCount, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY}; } + static std::vector<std::string> BuffersOut() { return {kBufVecY}; } // Helper for the sizes per batch static size_t PerBatchSizeX(const Arguments<T> &args) { return args.n * args.x_inc; } @@ -123,17 +125,12 @@ class TestXaxpyBatched { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { cblasXaxpy(args.n, args.alphas[batch], - x_vec_cpu, args.x_offsets[batch], args.x_inc, - y_vec_cpu, args.y_offsets[batch], args.y_inc); + buffers_host.x_vec, args.x_offsets[batch], args.x_inc, + buffers_host.y_vec, args.y_offsets[batch], args.y_inc); } - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/levelx/xgemmbatched.hpp b/test/routines/levelx/xgemmbatched.hpp index 80a30e4d..ab5f20c5 100644 --- a/test/routines/levelx/xgemmbatched.hpp +++ b/test/routines/levelx/xgemmbatched.hpp @@ -45,6 +45,8 @@ class TestXgemmBatched { kArgAOffset, kArgBOffset, kArgCOffset, kArgBatchCount, kArgAlpha, kArgBeta}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; } + static std::vector<std::string> BuffersOut() { return {kBufMatC}; } // Helper for the sizes per batch static size_t PerBatchSizeA(const Arguments<T> &args) { @@ -152,23 +154,16 @@ class TestXgemmBatched { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) { for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { cblasXgemm(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), convertToCBLAS(args.b_transpose), args.m, args.n, args.k, args.alphas[batch], - a_mat_cpu, args.a_offsets[batch], args.a_ld, - b_mat_cpu, args.b_offsets[batch], args.b_ld, args.betas[batch], - c_mat_cpu, args.c_offsets[batch], args.c_ld); + buffers_host.a_mat, args.a_offsets[batch], args.a_ld, + buffers_host.b_mat, args.b_offsets[batch], args.b_ld, args.betas[batch], + buffers_host.c_mat, args.c_offsets[batch], args.c_ld); } - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp index b470dbf3..ffb484b0 100644 --- a/test/routines/levelx/xinvert.hpp +++ b/test/routines/levelx/xinvert.hpp @@ -25,17 +25,10 @@ namespace clblast { // ================================================================================================= template <typename T> -StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { +StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) { const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) || (args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor)); - // Data transfer from OpenCL to std::vector - std::vector<T> a_mat_cpu(args.a_size, T{0.0}); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - - // Creates the output buffer - std::vector<T> b_mat_cpu(args.b_size, T{0.0}); - // Helper variables const auto block_size = args.m; const auto num_blocks = CeilDiv(args.n, block_size); @@ -60,11 +53,11 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu auto a_value = T{1.0}; if (args.diagonal == Diagonal::kNonUnit) { if (i + block_id * block_size < args.n) { - if (a_mat_cpu[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; } - a_value = T{1.0} / a_mat_cpu[i * a_ld + i + a_offset]; + if (buffers_host.a_mat[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; } + a_value = T{1.0} / buffers_host.a_mat[i * a_ld + i + a_offset]; } } - b_mat_cpu[i * b_ld + i + b_offset] = a_value; + buffers_host.b_mat[i * b_ld + i + b_offset] = a_value; } // Inverts the upper triangle row by row @@ -75,11 +68,11 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu for (auto k = i + 1; k <= j; ++k) { auto a_value = T{0.0}; if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) { - a_value = a_mat_cpu[k * a_ld + i + a_offset]; + a_value = buffers_host.a_mat[k * a_ld + i + a_offset]; } - sum += a_value * b_mat_cpu[j * b_ld + k + b_offset]; + sum += a_value * buffers_host.b_mat[j * b_ld + k + b_offset]; } - b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset]; + buffers_host.b_mat[j * b_ld + i + b_offset] = - sum * buffers_host.b_mat[i * b_ld + i + b_offset]; } } } @@ -92,35 +85,32 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu for (auto k = j; k < i; ++k) { auto a_value = T{0.0}; if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) { - a_value = a_mat_cpu[k * a_ld + i + a_offset]; + a_value = buffers_host.a_mat[k * a_ld + i + a_offset]; } - sum += a_value * b_mat_cpu[j * b_ld + k + b_offset]; + sum += a_value * buffers_host.b_mat[j * b_ld + k + b_offset]; } - b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset]; + buffers_host.b_mat[j * b_ld + i + b_offset] = - sum * buffers_host.b_mat[i * b_ld + i + b_offset]; } } } } - - // Data transfer back to OpenCL - buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); return StatusCode::kSuccess; } // Half-precision version calling the above reference implementation after conversions template <> -StatusCode RunReference<half>(const Arguments<half> &args, Buffers<half> &buffers, Queue &queue) { - auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue()); - auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue()); - auto dummy = clblast::Buffer<float>(0); - auto buffers2 = Buffers<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; +StatusCode RunReference<half>(const Arguments<half> &args, BuffersHost<half> &buffers_host) { + auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat); + auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat); + auto dummy = std::vector<float>(0); + auto buffers2 = BuffersHost<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; auto args2 = Arguments<float>(); args2.a_size = args.a_size; args2.b_size = args.b_size; args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n; args2.a_offset = args.a_offset; args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal; - auto status = RunReference(args2, buffers2, queue); - FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue()); + auto status = RunReference(args2, buffers2); + FloatToHalfBuffer(buffers_host.b_mat, b_buffer2); return status; } @@ -140,6 +130,8 @@ class TestXinvert { kArgLayout, kArgTriangle, kArgDiagonal, kArgALeadDim, kArgAOffset}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; } + static std::vector<std::string> BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<T> &args) { @@ -190,11 +182,15 @@ class TestXinvert { // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + auto buffers_host = BuffersHost<T>(); + DeviceToHost(args, buffers, buffers_host, queue, BuffersIn()); + const auto status = RunReference(args, buffers_host); + HostToDevice(args, buffers, buffers_host, queue, BuffersOut()); + return status; } - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) { + return RunReference(args, buffers_host); } // Describes how to download the results of the computation (more importantly: which buffer) diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp index d1064d0c..d5973b4c 100644 --- a/test/routines/levelx/xomatcopy.hpp +++ b/test/routines/levelx/xomatcopy.hpp @@ -23,13 +23,7 @@ namespace clblast { // ================================================================================================= template <typename T> -StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - - // Data transfer from OpenCL to std::vector - std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); - std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); +StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) { // Checking for invalid arguments const auto a_rotated = (args.layout == Layout::kRowMajor); @@ -40,8 +34,8 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu if ((args.m == 0) || (args.n == 0)) { return StatusCode::kInvalidDimension; } if ((args.a_ld < args.m && !a_rotated) || (args.a_ld < args.n && a_rotated)) { return StatusCode::kInvalidLeadDimA; } if ((args.b_ld < args.m && !b_rotated) || (args.b_ld < args.n && b_rotated)) { return StatusCode::kInvalidLeadDimB; } - if (buffers.a_mat.GetSize() < (a_base + args.a_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryA; } - if (buffers.b_mat.GetSize() < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; } + if (buffers_host.a_mat.size() * sizeof(T) < (a_base + args.a_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryA; } + if (buffers_host.b_mat.size() * sizeof(T) < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; } // Matrix copy, scaling, and/or transpose for (auto id1 = size_t{0}; id1 < args.m; ++id1) { @@ -52,30 +46,27 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu const auto b_two = (b_rotated) ? id1 : id2; const auto a_index = a_two * args.a_ld + a_one + args.a_offset; const auto b_index = b_two * args.b_ld + b_one + args.b_offset; - b_mat_cpu[b_index] = args.alpha * a_mat_cpu[a_index]; + buffers_host.b_mat[b_index] = args.alpha * buffers_host.a_mat[a_index]; } } - - // Data transfer back to OpenCL - buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); return StatusCode::kSuccess; } // Half-precision version calling the above reference implementation after conversions template <> -StatusCode RunReference<half>(const Arguments<half> &args, Buffers<half> &buffers, Queue &queue) { - auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue()); - auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue()); - auto dummy = clblast::Buffer<float>(0); - auto buffers2 = Buffers<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; +StatusCode RunReference<half>(const Arguments<half> &args, BuffersHost<half> &buffers_host) { + auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat); + auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat); + auto dummy = std::vector<float>(0); + auto buffers2 = BuffersHost<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy}; auto args2 = Arguments<float>(); args2.a_size = args.a_size; args2.b_size = args.b_size; args2.a_ld = args.a_ld; args2.b_ld = args.b_ld; args2.m = args.m; args2.n = args.n; args2.a_offset = args.a_offset; args2.b_offset = args.b_offset; args2.layout = args.layout; args2.a_transpose = args.a_transpose; args2.alpha = HalfToFloat(args.alpha); - auto status = RunReference(args2, buffers2, queue); - FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue()); + auto status = RunReference(args2, buffers2); + FloatToHalfBuffer(buffers_host.b_mat, b_buffer2); return status; } @@ -97,6 +88,8 @@ class TestXomatcopy { kArgAOffset, kArgBOffset, kArgAlpha}; } + static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; } + static std::vector<std::string> BuffersOut() { return {kBufMatB}; } // Describes how to obtain the sizes of the buffers static size_t GetSizeA(const Arguments<T> &args) { @@ -148,11 +141,15 @@ class TestXomatcopy { // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + auto buffers_host = BuffersHost<T>(); + DeviceToHost(args, buffers, buffers_host, queue, BuffersIn()); + const auto status = RunReference(args, buffers_host); + HostToDevice(args, buffers, buffers_host, queue, BuffersOut()); + return status; } - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) { + return RunReference(args, buffers_host); } // Describes how to download the results of the computation (more importantly: which buffer) |