diff options
Diffstat (limited to 'test')
45 files changed, 1667 insertions, 608 deletions
diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index 1329b2c5..cc9a5adb 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -79,24 +79,6 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st // Iterates over all the to-be-tested combinations of arguments for (auto &args: test_vector) { - // Runs the reference clBLAS code - auto x_vec1 = Buffer<T>(context_, args.x_size); - auto y_vec1 = Buffer<T>(context_, args.y_size); - auto a_mat1 = Buffer<T>(context_, args.a_size); - auto b_mat1 = Buffer<T>(context_, args.b_size); - auto c_mat1 = Buffer<T>(context_, args.c_size); - auto ap_mat1 = Buffer<T>(context_, args.ap_size); - auto scalar1 = Buffer<T>(context_, args.scalar_size); - x_vec1.Write(queue_, args.x_size, x_source_); - y_vec1.Write(queue_, args.y_size, y_source_); - a_mat1.Write(queue_, args.a_size, a_source_); - b_mat1.Write(queue_, args.b_size, b_source_); - c_mat1.Write(queue_, args.c_size, c_source_); - ap_mat1.Write(queue_, args.ap_size, ap_source_); - scalar1.Write(queue_, args.scalar_size, scalar_source_); - auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; - auto status1 = run_reference_(args, buffers1, queue_); - // Runs the CLBlast code auto x_vec2 = Buffer<T>(context_, args.x_size); auto y_vec2 = Buffer<T>(context_, args.y_size); @@ -115,6 +97,33 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; auto status2 = run_routine_(args, buffers2, queue_); + #ifndef CLBLAST_REF_CLBLAS + // Don't continue with CBLAS if there are incorrect parameters + if (status2 != StatusCode::kSuccess) { + // TODO: Mark this as a skipped test instead of a succesfull test + TestErrorCodes(status2, status2, args); + continue; + } + #endif + + // Runs the reference BLAS code + auto x_vec1 = Buffer<T>(context_, args.x_size); + auto y_vec1 = Buffer<T>(context_, args.y_size); + auto a_mat1 = Buffer<T>(context_, args.a_size); + auto b_mat1 = Buffer<T>(context_, args.b_size); + auto c_mat1 = Buffer<T>(context_, args.c_size); + auto ap_mat1 = Buffer<T>(context_, args.ap_size); + auto scalar1 = Buffer<T>(context_, args.scalar_size); + x_vec1.Write(queue_, args.x_size, x_source_); + y_vec1.Write(queue_, args.y_size, y_source_); + a_mat1.Write(queue_, args.a_size, a_source_); + b_mat1.Write(queue_, args.b_size, b_source_); + c_mat1.Write(queue_, args.c_size, c_source_); + ap_mat1.Write(queue_, args.ap_size, ap_source_); + scalar1.Write(queue_, args.scalar_size, scalar_source_); + auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; + auto status1 = run_reference_(args, buffers1, queue_); + // Tests for equality of the two status codes if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) { TestErrorCodes(status1, status2, args); diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 7c9032bd..8181aaf6 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -68,7 +68,7 @@ class TestBlas: public Tester<T,U> { static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file // Shorthand for the routine-specific functions passed to the tester - using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers<T>&, Queue&)>; + using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>; using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers<T>&, Queue&)>; using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>; using ResultIterator = std::function<size_t(const Arguments<U>&)>; @@ -76,8 +76,9 @@ class TestBlas: public Tester<T,U> { // Constructor, initializes the base class tester and input data TestBlas(int argc, char *argv[], const bool silent, const std::string &name, const std::vector<std::string> &options, - const Routine run_routine, const Routine run_reference, const ResultGet get_result, - const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2); + const Routine run_routine, const Routine run_reference, + const ResultGet get_result, const ResultIndex get_index, + const ResultIterator get_id1, const ResultIterator get_id2); // The test functions, taking no inputs void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name); @@ -110,9 +111,17 @@ class TestBlas: public Tester<T,U> { template <typename C, typename T, typename U> void RunTests(int argc, char *argv[], const bool silent, const std::string &name) { + // Sets the reference to test against + #ifdef CLBLAST_REF_CLBLAS + const auto reference_routine = C::RunReference1; // clBLAS when available + #else + const auto reference_routine = C::RunReference2; // otherwise CBLAS + #endif + // Creates a tester auto options = C::GetOptions(); - TestBlas<T,U> tester{argc, argv, silent, name, options, C::RunRoutine, C::RunReference, + TestBlas<T,U> tester{argc, argv, silent, name, options, + C::RunRoutine, reference_routine, C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2}; // This variable holds the arguments relevant for this routine @@ -250,23 +259,25 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name } // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector<Arguments<U>>{}; - auto i_args = args; - i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize; - i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize; - for (auto &x_size: x_sizes) { i_args.x_size = x_size; - for (auto &y_size: y_sizes) { i_args.y_size = y_size; - for (auto &a_size: a_sizes) { i_args.a_size = a_size; - for (auto &b_size: b_sizes) { i_args.b_size = b_size; - for (auto &c_size: c_sizes) { i_args.c_size = c_size; - for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size; - invalid_test_vector.push_back(i_args); + #ifdef CLBLAST_REF_CLBLAS + auto invalid_test_vector = std::vector<Arguments<U>>{}; + auto i_args = args; + i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize; + i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize; + for (auto &x_size: x_sizes) { i_args.x_size = x_size; + for (auto &y_size: y_sizes) { i_args.y_size = y_size; + for (auto &a_size: a_sizes) { i_args.a_size = a_size; + for (auto &b_size: b_sizes) { i_args.b_size = b_size; + for (auto &c_size: c_sizes) { i_args.c_size = c_size; + for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size; + invalid_test_vector.push_back(i_args); + } } } } } } - } + #endif // Sets the name of this test-case auto names = std::vector<std::string>{}; @@ -287,7 +298,9 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name // Runs the tests tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); + #ifdef CLBLAST_REF_CLBLAS + tester.TestInvalid(invalid_test_vector, case_name); + #endif } } } diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 8169f700..872a131a 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -69,10 +69,12 @@ Tester<T,U>::Tester(int argc, char *argv[], const bool silent, kUnsupportedPrecision.c_str()); // Initializes clBLAS - auto status = clblasSetup(); - if (status != CL_SUCCESS) { - throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status))); - } + #ifdef CLBLAST_REF_CLBLAS + auto status = clblasSetup(); + if (status != CL_SUCCESS) { + throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status))); + } + #endif } // Destructor prints the summary of the test cases and cleans-up the clBLAS library @@ -87,7 +89,11 @@ Tester<T,U>::~Tester() { fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str()); } fprintf(stdout, "\n"); - clblasTeardown(); + + // Cleans-up clBLAS + #ifdef CLBLAST_REF_CLBLAS + clblasTeardown(); + #endif } // ================================================================================================= diff --git a/test/correctness/tester.h b/test/correctness/tester.h index db714f3d..d489f829 100644 --- a/test/correctness/tester.h +++ b/test/correctness/tester.h @@ -23,7 +23,9 @@ #include <memory> // The libraries -#include <clBLAS.h> +#ifdef CLBLAST_REF_CLBLAS + #include <clBLAS.h> +#endif #include "clblast.h" #include "internal/utilities.h" @@ -92,7 +94,7 @@ class Tester { Queue queue_; // Whether or not to run the full test-suite or just a smoke test - bool full_test_; + const bool full_test_; // Retrieves the offset values to test with const std::vector<size_t> GetOffsets() const; diff --git a/test/performance/client.cc b/test/performance/client.cc index 17f54231..56ab8c8d 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -24,11 +24,13 @@ namespace clblast { // Constructor template <typename T, typename U> -Client<T,U>::Client(const Routine run_routine, const Routine run_reference, +Client<T,U>::Client(const Routine run_routine, + const Routine run_reference1, const Routine run_reference2, const std::vector<std::string> &options, const GetMetric get_flops, const GetMetric get_bytes): run_routine_(run_routine), - run_reference_(run_reference), + run_reference1_(run_reference1), + run_reference2_(run_reference2), options_(options), get_flops_(get_flops), get_bytes_(get_bytes) { @@ -90,7 +92,16 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); - args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1); + #ifdef CLBLAST_REF_CLBLAS + args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1); + #else + args.compare_clblas = 0; + #endif + #ifdef CLBLAST_REF_CBLAS + args.compare_cblas = GetArgument(argc, argv, help, kArgComparecblas, 1); + #else + args.compare_cblas = 0; + #endif args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1}); args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0}); args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10}); @@ -120,7 +131,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) auto device = Device(platform, args.device_id); auto context = Context(device); auto queue = Queue(context, device); - if (args.compare_clblas) { clblasSetup(); } + #ifdef CLBLAST_REF_CLBLAS + if (args.compare_clblas) { clblasSetup(); } + #endif // Iterates over all "num_step" values jumping by "step" each time auto s = size_t{0}; @@ -167,9 +180,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast"); timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast)); if (args.compare_clblas) { - auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS"); + auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS"); timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas)); } + if (args.compare_cblas) { + auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS"); + timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas)); + } // Prints the performance of the tested libraries PrintTableRow(args, timings); @@ -186,7 +203,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) } // Cleans-up and returns - if (args.compare_clblas) { clblasTeardown(); } + #ifdef CLBLAST_REF_CLBLAS + if (args.compare_clblas) { clblasTeardown(); } + #endif } // ================================================================================================= @@ -196,14 +215,17 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) // value found in the vector of timing results. The return value is in milliseconds. template <typename T, typename U> double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args, - const Buffers<T> &buffers, Queue &queue, + Buffers<T> &buffers, Queue &queue, Routine run_blas, const std::string &library_name) { auto timings = std::vector<double>(num_runs); for (auto &timing: timings) { auto start_time = std::chrono::steady_clock::now(); // Executes the main computation - auto status = run_blas(args, buffers, queue); + auto status = StatusCode::kSuccess; + try { + status = run_blas(args, buffers, queue); + } catch (...) { status = static_cast<StatusCode>(kUnknownError); } if (status != StatusCode::kSuccess) { throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status))); } @@ -226,6 +248,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) { for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); } fprintf(stdout, " | <-- CLBlast -->"); if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); } + if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); } fprintf(stdout, " |\n"); } @@ -233,6 +256,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) { for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); } fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1"); if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); } + if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); } fprintf(stdout, "\n"); } diff --git a/test/performance/client.h b/test/performance/client.h index 5805b8a5..8d0597d7 100644 --- a/test/performance/client.h +++ b/test/performance/client.h @@ -26,7 +26,9 @@ #include <utility> // The libraries to test -#include <clBLAS.h> +#ifdef CLBLAST_REF_CLBLAS + #include <clBLAS.h> +#endif #include "clblast.h" #include "internal/utilities.h" @@ -40,12 +42,12 @@ class Client { public: // Shorthand for the routine-specific functions passed to the tester - using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers<T>&, Queue&)>; + using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>; using SetMetric = std::function<void(Arguments<U>&)>; using GetMetric = std::function<size_t(const Arguments<U>&)>; // The constructor - Client(const Routine run_routine, const Routine run_reference, + Client(const Routine run_routine, const Routine run_reference1, const Routine run_reference2, const std::vector<std::string> &options, const GetMetric get_flops, const GetMetric get_bytes); @@ -61,7 +63,7 @@ class Client { private: // Runs a function a given number of times and returns the execution time of the shortest instance - double TimedExecution(const size_t num_runs, const Arguments<U> &args, const Buffers<T> &buffers, + double TimedExecution(const size_t num_runs, const Arguments<U> &args, Buffers<T> &buffers, Queue &queue, Routine run_blas, const std::string &library_name); // Prints the header of a performance-data table @@ -73,7 +75,8 @@ class Client { // The routine-specific functions passed to the tester const Routine run_routine_; - const Routine run_reference_; + const Routine run_reference1_; + const Routine run_reference2_; const std::vector<std::string> options_; const GetMetric get_flops_; const GetMetric get_bytes_; @@ -81,13 +84,31 @@ class Client { // ================================================================================================= +// Bogus reference function, in case a comparison library is not available +template <typename T, typename U> +static StatusCode ReferenceNotAvailable(const Arguments<U> &, Buffers<T> &, Queue &) { + return StatusCode::kNotImplemented; +} + // The interface to the performance client. This is a separate function in the header such that it // is automatically compiled for each routine, templated by the parameter "C". template <typename C, typename T, typename U> void RunClient(int argc, char *argv[]) { + // Sets the reference to test against + #ifdef CLBLAST_REF_CLBLAS + const auto reference1 = C::RunReference1; // clBLAS when available + #else + const auto reference1 = ReferenceNotAvailable<T,U>; + #endif + #ifdef CLBLAST_REF_CBLAS + const auto reference2 = C::RunReference2; // CBLAS when available + #else + const auto reference2 = ReferenceNotAvailable<T,U>; + #endif + // Creates a new client - auto client = Client<T,U>(C::RunRoutine, C::RunReference, C::GetOptions(), + auto client = Client<T,U>(C::RunRoutine, reference1, reference2, C::GetOptions(), C::GetFlops, C::GetBytes); // Simple command line argument parser with defaults diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h index 50480f46..8f72f570 100644 --- a/test/routines/level1/xaxpy.h +++ b/test/routines/level1/xaxpy.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXaxpy { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Axpy(args.n, args.alpha, @@ -77,16 +82,33 @@ class TestXaxpy { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXaxpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXaxpy(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXaxpy(args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level1/xcopy.h b/test/routines/level1/xcopy.h index 8d324d88..0527ca6a 100644 --- a/test/routines/level1/xcopy.h +++ b/test/routines/level1/xcopy.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -64,7 +69,7 @@ class TestXcopy { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Copy<T>(args.n, @@ -76,16 +81,33 @@ class TestXcopy { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXcopy<T>(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXcopy<T>(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXcopy(args.n, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.h index 04669f52..d1c34c0f 100644 --- a/test/routines/level1/xdot.h +++ b/test/routines/level1/xdot.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -68,7 +73,7 @@ class TestXdot { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dot<T>(args.n, @@ -81,17 +86,37 @@ class TestXdot { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXdot<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdot<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXdot(args.n, + scalar_cpu, args.dot_offset, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.h index e5b42ef4..a2742cb0 100644 --- a/test/routines/level1/xdotc.h +++ b/test/routines/level1/xdotc.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -68,7 +73,7 @@ class TestXdotc { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotc<T>(args.n, @@ -81,17 +86,37 @@ class TestXdotc { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXdotc<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdotc<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXdotc(args.n, + scalar_cpu, args.dot_offset, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.h index 6430148c..06ce979e 100644 --- a/test/routines/level1/xdotu.h +++ b/test/routines/level1/xdotu.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -68,7 +73,7 @@ class TestXdotu { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotu<T>(args.n, @@ -81,17 +86,37 @@ class TestXdotu { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXdotu<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdotu<T>(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXdotu(args.n, + scalar_cpu, args.dot_offset, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level1/xnrm2.h b/test/routines/level1/xnrm2.h index e3f77ee4..d8a0de4e 100644 --- a/test/routines/level1/xnrm2.h +++ b/test/routines/level1/xnrm2.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -64,7 +69,7 @@ class TestXnrm2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Nrm2<T>(args.n, @@ -76,16 +81,33 @@ class TestXnrm2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXnrm2<T>(args.n, - buffers.scalar(), args.nrm2_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXnrm2<T>(args.n, + buffers.scalar(), args.nrm2_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXnrm2(args.n, + scalar_cpu, args.nrm2_offset, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level1/xscal.h b/test/routines/level1/xscal.h index d990afcc..35855dbd 100644 --- a/test/routines/level1/xscal.h +++ b/test/routines/level1/xscal.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -61,7 +66,7 @@ class TestXscal { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Scal(args.n, args.alpha, @@ -72,15 +77,29 @@ class TestXscal { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXscal(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXscal(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXscal(args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level1/xswap.h b/test/routines/level1/xswap.h index 2096a2c3..ae69d3be 100644 --- a/test/routines/level1/xswap.h +++ b/test/routines/level1/xswap.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -64,7 +69,7 @@ class TestXswap { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Swap<T>(args.n, @@ -76,16 +81,34 @@ class TestXswap { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXswap<T>(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXswap<T>(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXswap(args.n, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h index 0e238804..b875075d 100644 --- a/test/routines/level2/xgbmv.h +++ b/test/routines/level2/xgbmv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -76,7 +81,7 @@ class TestXgbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gbmv(args.layout, args.a_transpose, @@ -90,19 +95,41 @@ class TestXgbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout), - static_cast<clblasTranspose>(args.a_transpose), - args.m, args.n, args.kl, args.ku, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasTranspose>(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h index 2924d498..a70ccd34 100644 --- a/test/routines/level2/xgemv.h +++ b/test/routines/level2/xgemv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -76,7 +81,7 @@ class TestXgemv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemv(args.layout, args.a_transpose, @@ -90,19 +95,41 @@ class TestXgemv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgemv(static_cast<clblasOrder>(args.layout), - static_cast<clblasTranspose>(args.a_transpose), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgemv(static_cast<clblasOrder>(args.layout), + static_cast<clblasTranspose>(args.a_transpose), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgemv(convertToCBLAS(args.layout), + convertToCBLAS(args.a_transpose), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xger.h b/test/routines/level2/xger.h index 98296e92..32c2a505 100644 --- a/test/routines/level2/xger.h +++ b/test/routines/level2/xger.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -72,7 +77,7 @@ class TestXger { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Ger(args.layout, @@ -86,18 +91,39 @@ class TestXger { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXger(static_cast<clblasOrder>(args.layout), - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXger(static_cast<clblasOrder>(args.layout), + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXger(convertToCBLAS(args.layout), + args.m, args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xgerc.h b/test/routines/level2/xgerc.h index 77258d92..4b6954f6 100644 --- a/test/routines/level2/xgerc.h +++ b/test/routines/level2/xgerc.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -72,7 +77,7 @@ class TestXgerc { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gerc(args.layout, @@ -86,18 +91,39 @@ class TestXgerc { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgerc(static_cast<clblasOrder>(args.layout), - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgerc(static_cast<clblasOrder>(args.layout), + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgerc(convertToCBLAS(args.layout), + args.m, args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xgeru.h b/test/routines/level2/xgeru.h index e5f5f235..295e69e5 100644 --- a/test/routines/level2/xgeru.h +++ b/test/routines/level2/xgeru.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -72,7 +77,7 @@ class TestXgeru { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Geru(args.layout, @@ -86,18 +91,39 @@ class TestXgeru { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgeru(static_cast<clblasOrder>(args.layout), - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgeru(static_cast<clblasOrder>(args.layout), + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgeru(convertToCBLAS(args.layout), + args.m, args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h index 34e1502f..e0bdc4da 100644 --- a/test/routines/level2/xhbmv.h +++ b/test/routines/level2/xhbmv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hbmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.kl, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xhemv.h b/test/routines/level2/xhemv.h index 80e22157..fa242961 100644 --- a/test/routines/level2/xhemv.h +++ b/test/routines/level2/xhemv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhemv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhemv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhemv(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhemv(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhemv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xher.h b/test/routines/level2/xher.h index 53c4200f..7d0e8cc3 100644 --- a/test/routines/level2/xher.h +++ b/test/routines/level2/xher.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -66,7 +71,7 @@ class TestXher { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Her(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXher { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXher(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXher(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXher(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xher2.h b/test/routines/level2/xher2.h index c12ff827..445bba74 100644 --- a/test/routines/level2/xher2.h +++ b/test/routines/level2/xher2.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXher2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Her2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXher2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXher2(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXher2(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXher2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h index 8fd85b62..406e564f 100644 --- a/test/routines/level2/xhpmv.h +++ b/test/routines/level2/xhpmv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhpmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhpmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhpmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + ap_mat_cpu, args.ap_offset, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xhpr.h b/test/routines/level2/xhpr.h index 03599ddc..6f56d3f3 100644 --- a/test/routines/level2/xhpr.h +++ b/test/routines/level2/xhpr.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -66,7 +71,7 @@ class TestXhpr { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXhpr { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhpr(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpr(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXhpr(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xhpr2.h b/test/routines/level2/xhpr2.h index 68fbc76d..43889cb9 100644 --- a/test/routines/level2/xhpr2.h +++ b/test/routines/level2/xhpr2.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhpr2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhpr2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhpr2(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpr2(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhpr2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h index 5bc17e49..9a5c5140 100644 --- a/test/routines/level2/xsbmv.h +++ b/test/routines/level2/xsbmv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXsbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Sbmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXsbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXsbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.kl, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h index e335da42..913af0cd 100644 --- a/test/routines/level2/xspmv.h +++ b/test/routines/level2/xspmv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXspmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXspmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXspmv(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspmv(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXspmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + ap_mat_cpu, args.ap_offset, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xspr.h b/test/routines/level2/xspr.h index 819b1ca8..bab5c541 100644 --- a/test/routines/level2/xspr.h +++ b/test/routines/level2/xspr.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -66,7 +71,7 @@ class TestXspr { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spr(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXspr { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXspr(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspr(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXspr(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xspr2.h b/test/routines/level2/xspr2.h index 43d66c9e..41a04cc0 100644 --- a/test/routines/level2/xspr2.h +++ b/test/routines/level2/xspr2.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXspr2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spr2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXspr2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXspr2(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspr2(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXspr2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xsymv.h b/test/routines/level2/xsymv.h index 13473a3e..0576bc1f 100644 --- a/test/routines/level2/xsymv.h +++ b/test/routines/level2/xsymv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXsymv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Symv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXsymv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsymv(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsymv(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXsymv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xsyr.h b/test/routines/level2/xsyr.h index 66b75c0c..062eea5a 100644 --- a/test/routines/level2/xsyr.h +++ b/test/routines/level2/xsyr.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -66,7 +71,7 @@ class TestXsyr { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXsyr { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyr(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXsyr(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xsyr2.h b/test/routines/level2/xsyr2.h index 32497a61..50bc3cea 100644 --- a/test/routines/level2/xsyr2.h +++ b/test/routines/level2/xsyr2.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXsyr2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXsyr2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyr2(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXsyr2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h index dbdddb65..600b4131 100644 --- a/test/routines/level2/xtbmv.h +++ b/test/routines/level2/xtbmv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXtbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, @@ -78,20 +83,41 @@ class TestXtbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - static_cast<clblasTranspose>(args.a_transpose), - static_cast<clblasDiag>(args.diagonal), - args.n, args.kl, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + static_cast<clblasDiag>(args.diagonal), + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXtbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, args.kl, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h index 4425765e..fc0cf393 100644 --- a/test/routines/level2/xtpmv.h +++ b/test/routines/level2/xtpmv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXtpmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, @@ -78,20 +83,41 @@ class TestXtpmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - static_cast<clblasTranspose>(args.a_transpose), - static_cast<clblasDiag>(args.diagonal), - args.n, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + static_cast<clblasDiag>(args.diagonal), + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXtpmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, + ap_mat_cpu, args.ap_offset, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h index 1c0c6fd8..fec72124 100644 --- a/test/routines/level2/xtrmv.h +++ b/test/routines/level2/xtrmv.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXtrmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, @@ -78,20 +83,41 @@ class TestXtrmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - static_cast<clblasTranspose>(args.a_transpose), - static_cast<clblasDiag>(args.diagonal), - args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + static_cast<clblasDiag>(args.diagonal), + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXtrmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h index 695b58b7..49a92936 100644 --- a/test/routines/level3/xgemm.h +++ b/test/routines/level3/xgemm.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -78,7 +83,7 @@ class TestXgemm { static Transposes GetBTransposes(const Transposes &all) { return all; } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, @@ -92,20 +97,43 @@ class TestXgemm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgemm(static_cast<clblasOrder>(args.layout), - static_cast<clblasTranspose>(args.a_transpose), - static_cast<clblasTranspose>(args.b_transpose), - args.m, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgemm(static_cast<clblasOrder>(args.layout), + static_cast<clblasTranspose>(args.a_transpose), + static_cast<clblasTranspose>(args.b_transpose), + args.m, args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); + std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXgemm(convertToCBLAS(args.layout), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.b_transpose), + args.m, args.n, args.k, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h index 7b7134e5..40538417 100644 --- a/test/routines/level3/xhemm.h +++ b/test/routines/level3/xhemm.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -78,7 +83,7 @@ class TestXhemm { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemm(args.layout, args.side, args.triangle, @@ -92,20 +97,43 @@ class TestXhemm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhemm(static_cast<clblasOrder>(args.layout), - static_cast<clblasSide>(args.side), - static_cast<clblasUplo>(args.triangle), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhemm(static_cast<clblasOrder>(args.layout), + static_cast<clblasSide>(args.side), + static_cast<clblasUplo>(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); + std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXhemm(convertToCBLAS(args.layout), + convertToCBLAS(args.side), + convertToCBLAS(args.triangle), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h index a7fbfcbe..1ea2ad36 100644 --- a/test/routines/level3/xher2k.h +++ b/test/routines/level3/xher2k.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -76,7 +81,7 @@ class TestXher2k { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; @@ -91,21 +96,45 @@ class TestXher2k { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto alpha2 = T{args.alpha, args.alpha}; - auto status = clblasXher2k(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - static_cast<clblasTranspose>(args.a_transpose), - args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto alpha2 = T{args.alpha, args.alpha}; + auto status = clblasXher2k(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + args.n, args.k, alpha2, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); + std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + auto alpha2 = T{args.alpha, args.alpha}; + cblasXher2k(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, alpha2, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h index f097672f..75a7c405 100644 --- a/test/routines/level3/xherk.h +++ b/test/routines/level3/xherk.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -69,7 +74,7 @@ class TestXherk { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Herk(args.layout, args.triangle, args.a_transpose, @@ -82,19 +87,39 @@ class TestXherk { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXherk(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - static_cast<clblasTranspose>(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXherk(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXherk(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h index 03cf5de9..f867c238 100644 --- a/test/routines/level3/xsymm.h +++ b/test/routines/level3/xsymm.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -78,7 +83,7 @@ class TestXsymm { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Symm(args.layout, args.side, args.triangle, @@ -92,20 +97,43 @@ class TestXsymm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsymm(static_cast<clblasOrder>(args.layout), - static_cast<clblasSide>(args.side), - static_cast<clblasUplo>(args.triangle), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsymm(static_cast<clblasOrder>(args.layout), + static_cast<clblasSide>(args.side), + static_cast<clblasUplo>(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); + std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXsymm(convertToCBLAS(args.layout), + convertToCBLAS(args.side), + convertToCBLAS(args.triangle), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h index 89e77f83..be4e1851 100644 --- a/test/routines/level3/xsyr2k.h +++ b/test/routines/level3/xsyr2k.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -76,7 +81,7 @@ class TestXsyr2k { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2k(args.layout, args.triangle, args.a_transpose, @@ -90,20 +95,43 @@ class TestXsyr2k { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - static_cast<clblasTranspose>(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); + std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXsyr2k(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h index 8dacb5b3..7675e2aa 100644 --- a/test/routines/level3/xsyrk.h +++ b/test/routines/level3/xsyrk.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -69,7 +74,7 @@ class TestXsyrk { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syrk(args.layout, args.triangle, args.a_transpose, @@ -82,19 +87,39 @@ class TestXsyrk { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout), - static_cast<clblasUplo>(args.triangle), - static_cast<clblasTranspose>(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXsyrk(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h index 152cdf58..a085cb15 100644 --- a/test/routines/level3/xtrmm.h +++ b/test/routines/level3/xtrmm.h @@ -19,7 +19,12 @@ #include <vector> #include <string> -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -69,7 +74,7 @@ class TestXtrmm { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, @@ -82,21 +87,43 @@ class TestXtrmm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout), - static_cast<clblasSide>(args.side), - static_cast<clblasUplo>(args.triangle), - static_cast<clblasTranspose>(args.a_transpose), - static_cast<clblasDiag>(args.diagonal), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast<StatusCode>(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout), + static_cast<clblasSide>(args.side), + static_cast<clblasUplo>(args.triangle), + static_cast<clblasTranspose>(args.a_transpose), + static_cast<clblasDiag>(args.diagonal), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast<StatusCode>(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); + std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + cblasXtrmm(convertToCBLAS(args.layout), + convertToCBLAS(args.side), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld); + buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index c690a45c..dec272b0 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -21,6 +21,13 @@ namespace clblast { +// Conversions from CLBlast types +CBLAS_ORDER convertToCBLAS(const Layout v) { return (v == Layout::kRowMajor) ? CblasRowMajor : CblasColMajor; } +CBLAS_TRANSPOSE convertToCBLAS(const Transpose v) { return (v == Transpose::kNo) ? CblasNoTrans : (v == Transpose::kYes) ? CblasTrans : CblasConjTrans; } +CBLAS_UPLO convertToCBLAS(const Triangle v) { return (v == Triangle::kUpper) ? CblasUpper : CblasLower; } +CBLAS_DIAG convertToCBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? CblasUnit : CblasNonUnit; } +CBLAS_SIDE convertToCBLAS(const Side v) { return (v == Side::kLeft) ? CblasLeft : CblasRight; } + // OpenBLAS is not fully Netlib CBLAS compatible #ifdef OPENBLAS_VERSION using return_pointer_float = openblas_complex_float*; |