diff options
51 files changed, 663 insertions, 626 deletions
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index 330db597..58dc3b27 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -73,6 +73,7 @@ constexpr auto kArgAsumOffset = "offasum"; constexpr auto kArgImaxOffset = "offimax"; constexpr auto kArgAlpha = "alpha"; constexpr auto kArgBeta = "beta"; +constexpr auto kArgBatchCount = "batch_count"; // The tuner-specific arguments in string form constexpr auto kArgFraction = "fraction"; @@ -156,6 +157,8 @@ struct Arguments { size_t imax_offset = 0; T alpha = ConstantOne<T>(); T beta = ConstantOne<T>(); + size_t batch_count = 1; + // Sizes size_t x_size = 1; size_t y_size = 1; size_t a_size = 1; diff --git a/test/correctness/misc/override_parameters.cpp b/test/correctness/misc/override_parameters.cpp index e6eebef7..c6c70d9f 100644 --- a/test/correctness/misc/override_parameters.cpp +++ b/test/correctness/misc/override_parameters.cpp @@ -88,7 +88,7 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st device_b.Write(queue, host_b.size(), host_b); device_c.Write(queue, host_c.size(), host_c); auto dummy = Buffer<T>(context, 1); - auto buffers = Buffers<T>{dummy, dummy, device_a, device_b, device_c, dummy, dummy}; + auto buffers = std::vector<Buffers<T>>{Buffers<T>{dummy, dummy, device_a, device_b, device_c, dummy, dummy}}; // Loops over the valid combinations: run before and run afterwards fprintf(stdout, "* Testing OverrideParameters for '%s'\n", routine_name.c_str()); diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index 505b3b36..fcb2eceb 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -27,6 +27,7 @@ template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kIncr template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixDims = { 7, 64 }; template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixVectorDims = { 61, 256 }; template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kBandSizes = { 4, 19 }; +template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kBatchCounts = { 1, 3 }; // Test settings for the invalid tests template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kInvalidIncrements = { 0, 1 }; @@ -81,15 +82,16 @@ TestBlas<T,U>::TestBlas(const std::vector<std::string> &arguments, const bool si const auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); const auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end()); const auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end()); + const auto max_batch_count = *std::max_element(kBatchCounts.begin(), kBatchCounts.end()); // Creates test input data - x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset); - y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset); - a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); - scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset); + x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset); + y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset); + a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); + scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset); std::mt19937 mt(kSeed); std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); PopulateVector(x_source_, mt, dist); @@ -124,21 +126,24 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st ap_source_, scalar_source_); // Set-up for the CLBlast run - auto x_vec2 = Buffer<T>(context_, args.x_size); - auto y_vec2 = Buffer<T>(context_, args.y_size); - auto a_mat2 = Buffer<T>(context_, args.a_size); - auto b_mat2 = Buffer<T>(context_, args.b_size); - auto c_mat2 = Buffer<T>(context_, args.c_size); - auto ap_mat2 = Buffer<T>(context_, args.ap_size); - auto scalar2 = Buffer<T>(context_, args.scalar_size); - x_vec2.Write(queue_, args.x_size, x_source_); - y_vec2.Write(queue_, args.y_size, y_source_); - a_mat2.Write(queue_, args.a_size, a_source_); - b_mat2.Write(queue_, args.b_size, b_source_); - c_mat2.Write(queue_, args.c_size, c_source_); - ap_mat2.Write(queue_, args.ap_size, ap_source_); - scalar2.Write(queue_, args.scalar_size, scalar_source_); - auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; + auto buffers2 = std::vector<Buffers<T>>(); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto x_vec2 = Buffer<T>(context_, args.x_size); + auto y_vec2 = Buffer<T>(context_, args.y_size); + auto a_mat2 = Buffer<T>(context_, args.a_size); + auto b_mat2 = Buffer<T>(context_, args.b_size); + auto c_mat2 = Buffer<T>(context_, args.c_size); + auto ap_mat2 = Buffer<T>(context_, args.ap_size); + auto scalar2 = Buffer<T>(context_, args.scalar_size); + x_vec2.Write(queue_, args.x_size, &x_source_[batch * args.x_size]); + y_vec2.Write(queue_, args.y_size, &y_source_[batch * args.y_size]); + a_mat2.Write(queue_, args.a_size, &a_source_[batch * args.a_size]); + b_mat2.Write(queue_, args.b_size, &b_source_[batch * args.b_size]); + c_mat2.Write(queue_, args.c_size, &c_source_[batch * args.c_size]); + ap_mat2.Write(queue_, args.ap_size, &ap_source_[batch * args.ap_size]); + scalar2.Write(queue_, args.scalar_size, &scalar_source_[batch * args.scalar_size]); + buffers2.push_back(Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}); + } // Runs CLBlast if (verbose_) { @@ -158,21 +163,24 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st } // Set-up for the reference run - auto x_vec1 = Buffer<T>(context_, args.x_size); - auto y_vec1 = Buffer<T>(context_, args.y_size); - auto a_mat1 = Buffer<T>(context_, args.a_size); - auto b_mat1 = Buffer<T>(context_, args.b_size); - auto c_mat1 = Buffer<T>(context_, args.c_size); - auto ap_mat1 = Buffer<T>(context_, args.ap_size); - auto scalar1 = Buffer<T>(context_, args.scalar_size); - x_vec1.Write(queue_, args.x_size, x_source_); - y_vec1.Write(queue_, args.y_size, y_source_); - a_mat1.Write(queue_, args.a_size, a_source_); - b_mat1.Write(queue_, args.b_size, b_source_); - c_mat1.Write(queue_, args.c_size, c_source_); - ap_mat1.Write(queue_, args.ap_size, ap_source_); - scalar1.Write(queue_, args.scalar_size, scalar_source_); - auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; + auto buffers1 = std::vector<Buffers<T>>(); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto x_vec1 = Buffer<T>(context_, args.x_size); + auto y_vec1 = Buffer<T>(context_, args.y_size); + auto a_mat1 = Buffer<T>(context_, args.a_size); + auto b_mat1 = Buffer<T>(context_, args.b_size); + auto c_mat1 = Buffer<T>(context_, args.c_size); + auto ap_mat1 = Buffer<T>(context_, args.ap_size); + auto scalar1 = Buffer<T>(context_, args.scalar_size); + x_vec1.Write(queue_, args.x_size, &x_source_[batch * args.x_size]); + y_vec1.Write(queue_, args.y_size, &y_source_[batch * args.y_size]); + a_mat1.Write(queue_, args.a_size, &a_source_[batch * args.a_size]); + b_mat1.Write(queue_, args.b_size, &b_source_[batch * args.b_size]); + c_mat1.Write(queue_, args.c_size, &c_source_[batch * args.c_size]); + ap_mat1.Write(queue_, args.ap_size, &ap_source_[batch * args.ap_size]); + scalar1.Write(queue_, args.scalar_size, &scalar_source_[batch * args.scalar_size]); + buffers1.push_back(Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}); + } // Runs the reference code if (verbose_) { @@ -189,46 +197,55 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st continue; } - // Downloads the results - auto result1 = get_result_(args, buffers1, queue_); - auto result2 = get_result_(args, buffers2, queue_); - - // Computes the L2 error - const auto kErrorMarginL2 = getL2ErrorMargin<T>(); + // Error checking for each batch + auto errors = size_t{0}; auto l2error = 0.0; - for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) { - for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) { - auto index = get_index_(args, id1, id2); - l2error += SquaredDifference(result1[index], result2[index]); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + + // Downloads the results + auto result1 = get_result_(args, buffers1[batch], queue_); + auto result2 = get_result_(args, buffers2[batch], queue_); + + // Computes the L2 error + auto l2error_batch = 0.0; + const auto kErrorMarginL2 = getL2ErrorMargin<T>(); + for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) { + for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) { + auto index = get_index_(args, id1, id2); + l2error_batch += SquaredDifference(result1[index], result2[index]); + } } - } - l2error /= (get_id1_(args) * get_id2_(args)); - - // Checks for differences in the output - auto errors = size_t{0}; - for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) { - for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) { - auto index = get_index_(args, id1, id2); - if (!TestSimilarity(result1[index], result2[index])) { - if (l2error >= kErrorMarginL2) { errors++; } - if (verbose_) { - if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); } - else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); } - fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str()); - fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str()); - if (l2error < kErrorMarginL2) { - fprintf(stdout, " - error suppressed by a low total L2 error\n"); + l2error_batch /= static_cast<double>(get_id1_(args) * get_id2_(args)); + l2error += l2error_batch; + + // Checks for differences in the output + for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) { + for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) { + auto index = get_index_(args, id1, id2); + if (!TestSimilarity(result1[index], result2[index])) { + if (l2error_batch >= kErrorMarginL2) { errors++; } + if (verbose_) { + if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); } + else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); } + fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str()); + fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str()); + if (l2error_batch < kErrorMarginL2) { + fprintf(stdout, " - error suppressed by a low total L2 error\n"); + } } } } } } + l2error /= static_cast<double>(args.batch_count); + + // Report the results if (verbose_ && errors > 0) { - fprintf(stdout, "\n Combined L2 error: %.2e\n ", l2error); + fprintf(stdout, "\n Combined average L2 error: %.2e\n ", l2error); } // Tests the error count (should be zero) - TestErrorCount(errors, get_id1_(args)*get_id2_(args), args); + TestErrorCount(errors, get_id1_(args)*get_id2_(args)*args.batch_count, args); } TestEnd(); } @@ -255,36 +272,40 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly // want to be able to create invalid buffers (no error checking here). - auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); - auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); - auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); - auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); - auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); - auto x_vec1 = Buffer<T>(x1); - auto y_vec1 = Buffer<T>(y1); - auto a_mat1 = Buffer<T>(a1); - auto b_mat1 = Buffer<T>(b1); - auto c_mat1 = Buffer<T>(c1); - auto ap_mat1 = Buffer<T>(ap1); - auto scalar1 = Buffer<T>(d1); - auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); - auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); - auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); - auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); - auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); - auto x_vec2 = Buffer<T>(x2); - auto y_vec2 = Buffer<T>(y2); - auto a_mat2 = Buffer<T>(a2); - auto b_mat2 = Buffer<T>(b2); - auto c_mat2 = Buffer<T>(c2); - auto ap_mat2 = Buffer<T>(ap2); - auto scalar2 = Buffer<T>(d2); - auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; - auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; + auto buffers1 = std::vector<Buffers<T>>(); + auto buffers2 = std::vector<Buffers<T>>(); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); + auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); + auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); + auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); + auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); + auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); + auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); + auto x_vec1 = Buffer<T>(x1); + auto y_vec1 = Buffer<T>(y1); + auto a_mat1 = Buffer<T>(a1); + auto b_mat1 = Buffer<T>(b1); + auto c_mat1 = Buffer<T>(c1); + auto ap_mat1 = Buffer<T>(ap1); + auto scalar1 = Buffer<T>(d1); + auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); + auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); + auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); + auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); + auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); + auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); + auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); + auto x_vec2 = Buffer<T>(x2); + auto y_vec2 = Buffer<T>(y2); + auto a_mat2 = Buffer<T>(a2); + auto b_mat2 = Buffer<T>(b2); + auto c_mat2 = Buffer<T>(c2); + auto ap_mat2 = Buffer<T>(ap2); + auto scalar2 = Buffer<T>(d2); + buffers1.push_back(Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}); + buffers2.push_back(Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}); + } // Runs CLBlast if (verbose_) { diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp index ee795aad..e675fa9b 100644 --- a/test/correctness/testblas.hpp +++ b/test/correctness/testblas.hpp @@ -56,6 +56,7 @@ class TestBlas: public Tester<T,U> { static const std::vector<size_t> kMatrixDims; static const std::vector<size_t> kMatrixVectorDims; static const std::vector<size_t> kBandSizes; + static const std::vector<size_t> kBatchCounts; const std::vector<size_t> kOffsets; const std::vector<U> kAlphaValues; const std::vector<U> kBetaValues; @@ -78,7 +79,7 @@ class TestBlas: public Tester<T,U> { std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&)>; - using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>; + using Routine = std::function<StatusCode(const Arguments<U>&, std::vector<Buffers<T>>&, Queue&)>; using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers<T>&, Queue&)>; using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>; using ResultIterator = std::function<size_t(const Arguments<U>&)>; @@ -183,6 +184,7 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na auto imax_offsets = std::vector<size_t>{args.imax_offset}; auto alphas = std::vector<U>{args.alpha}; auto betas = std::vector<U>{args.beta}; + auto batch_counts = std::vector<size_t>{args.batch_count}; auto x_sizes = std::vector<size_t>{args.x_size}; auto y_sizes = std::vector<size_t>{args.y_size}; auto a_sizes = std::vector<size_t>{args.a_size}; @@ -226,6 +228,7 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na if (option == kArgImaxOffset) { imax_offsets = tester.kOffsets; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; } + if (option == kArgBatchCount) { batch_counts = tester.kBatchCounts; } if (option == kArgXOffset) { x_sizes = tester.kVecSizes; } if (option == kArgYOffset) { y_sizes = tester.kVecSizes; } @@ -268,8 +271,10 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na for (auto &imax_offset: imax_offsets) { r_args.imax_offset = imax_offset; for (auto &alpha: alphas) { r_args.alpha = alpha; for (auto &beta: betas) { r_args.beta = beta; - C::SetSizes(r_args); - regular_test_vector.push_back(r_args); + for (auto &batch_count: batch_counts) { r_args.batch_count = batch_count; + C::SetSizes(r_args); + regular_test_vector.push_back(r_args); + } } } } diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp index cbfc5bb2..40784fdb 100644 --- a/test/correctness/tester.cpp +++ b/test/correctness/tester.cpp @@ -367,6 +367,7 @@ std::string Tester<T,U>::GetOptionsString(const Arguments<U> &args) { if (o == kArgDotOffset){ result += kArgDotOffset + equals + ToString(args.dot_offset) + " "; } if (o == kArgAlpha) { result += kArgAlpha + equals + ToString(args.alpha) + " "; } if (o == kArgBeta) { result += kArgBeta + equals + ToString(args.beta) + " "; } + if (o == kArgBatchCount){result += kArgBatchCount + equals + ToString(args.batch_count) + " "; } } return result; } diff --git a/test/performance/client.cpp b/test/performance/client.cpp index 16b44b5a..b1d5b718 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -94,6 +94,9 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t le // Scalar values if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar<U>()); } if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar<U>()); } + + // Batch arguments + if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, size_t{1}); } } // These are the options common to all routines @@ -174,13 +177,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) set_sizes(args); // Populates input host matrices with random data - std::vector<T> x_source(args.x_size); - std::vector<T> y_source(args.y_size); - std::vector<T> a_source(args.a_size); - std::vector<T> b_source(args.b_size); - std::vector<T> c_source(args.c_size); - std::vector<T> ap_source(args.ap_size); - std::vector<T> scalar_source(args.scalar_size); + std::vector<T> x_source(args.batch_count * args.x_size); + std::vector<T> y_source(args.batch_count * args.y_size); + std::vector<T> a_source(args.batch_count * args.a_size); + std::vector<T> b_source(args.batch_count * args.b_size); + std::vector<T> c_source(args.batch_count * args.c_size); + std::vector<T> ap_source(args.batch_count * args.ap_size); + std::vector<T> scalar_source(args.batch_count * args.scalar_size); std::mt19937 mt(kSeed); std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); PopulateVector(x_source, mt, dist); @@ -192,21 +195,24 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) PopulateVector(scalar_source, mt, dist); // Creates the matrices on the device - auto x_vec = Buffer<T>(context, args.x_size); - auto y_vec = Buffer<T>(context, args.y_size); - auto a_mat = Buffer<T>(context, args.a_size); - auto b_mat = Buffer<T>(context, args.b_size); - auto c_mat = Buffer<T>(context, args.c_size); - auto ap_mat = Buffer<T>(context, args.ap_size); - auto scalar = Buffer<T>(context, args.scalar_size); - x_vec.Write(queue, args.x_size, x_source); - y_vec.Write(queue, args.y_size, y_source); - a_mat.Write(queue, args.a_size, a_source); - b_mat.Write(queue, args.b_size, b_source); - c_mat.Write(queue, args.c_size, c_source); - ap_mat.Write(queue, args.ap_size, ap_source); - scalar.Write(queue, args.scalar_size, scalar_source); - auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar}; + auto buffers = std::vector<Buffers<T>>(); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto x_vec = Buffer<T>(context, args.x_size); + auto y_vec = Buffer<T>(context, args.y_size); + auto a_mat = Buffer<T>(context, args.a_size); + auto b_mat = Buffer<T>(context, args.b_size); + auto c_mat = Buffer<T>(context, args.c_size); + auto ap_mat = Buffer<T>(context, args.ap_size); + auto scalar = Buffer<T>(context, args.scalar_size); + x_vec.Write(queue, args.x_size, &x_source[batch * args.x_size]); + y_vec.Write(queue, args.y_size, &y_source[batch * args.y_size]); + a_mat.Write(queue, args.a_size, &a_source[batch * args.a_size]); + b_mat.Write(queue, args.b_size, &b_source[batch * args.b_size]); + c_mat.Write(queue, args.c_size, &c_source[batch * args.c_size]); + ap_mat.Write(queue, args.ap_size, &ap_source[batch * args.ap_size]); + scalar.Write(queue, args.scalar_size, &scalar_source[batch * args.scalar_size]); + buffers.push_back(Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar}); + } // Runs the routines and collects the timings auto timings = std::vector<std::pair<std::string, double>>(); @@ -248,7 +254,7 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) // value found in the vector of timing results. The return value is in milliseconds. template <typename T, typename U> double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args, - Buffers<T> &buffers, Queue &queue, + std::vector<Buffers<T>> &buffers, Queue &queue, Routine run_blas, const std::string &library_name) { auto status = StatusCode::kSuccess; @@ -339,6 +345,7 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); } else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); } else if (o == kArgImaxOffset){integers.push_back(args.imax_offset); } + else if (o == kArgBatchCount){integers.push_back(args.batch_count); } } auto strings = std::vector<std::string>{}; for (auto &o: options_) { diff --git a/test/performance/client.hpp b/test/performance/client.hpp index 4b3e17c7..a8e31419 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -43,7 +43,7 @@ class Client { static constexpr auto kSeed = 42; // fixed seed for reproducibility // Shorthand for the routine-specific functions passed to the tester - using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>; + using Routine = std::function<StatusCode(const Arguments<U>&, std::vector<Buffers<T>>&, Queue&)>; using SetMetric = std::function<void(Arguments<U>&)>; using GetMetric = std::function<size_t(const Arguments<U>&)>; @@ -66,7 +66,7 @@ class Client { private: // Runs a function a given number of times and returns the execution time of the shortest instance - double TimedExecution(const size_t num_runs, const Arguments<U> &args, Buffers<T> &buffers, + double TimedExecution(const size_t num_runs, const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue, Routine run_blas, const std::string &library_name); // Prints the header of a performance-data table diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp index a22f681f..faffff33 100644 --- a/test/routines/level1/xamax.hpp +++ b/test/routines/level1/xamax.hpp @@ -74,12 +74,12 @@ class TestXamax { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Amax<T>(args.n, - buffers.scalar(), args.imax_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers[0].scalar(), args.imax_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -87,12 +87,12 @@ class TestXamax { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXamax<T>(args.n, - buffers.scalar, args.imax_offset, - buffers.x_vec, args.x_offset, args.x_inc, + buffers[0].scalar, args.imax_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -101,15 +101,15 @@ class TestXamax { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXamax(args.n, scalar_cpu, args.imax_offset, x_vec_cpu, args.x_offset, args.x_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp index 64377189..fb2c9f1a 100644 --- a/test/routines/level1/xasum.hpp +++ b/test/routines/level1/xasum.hpp @@ -74,12 +74,12 @@ class TestXasum { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Asum<T>(args.n, - buffers.scalar(), args.asum_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers[0].scalar(), args.asum_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -87,12 +87,12 @@ class TestXasum { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXasum<T>(args.n, - buffers.scalar, args.asum_offset, - buffers.x_vec, args.x_offset, args.x_inc, + buffers[0].scalar, args.asum_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -101,15 +101,15 @@ class TestXasum { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXasum(args.n, scalar_cpu, args.asum_offset, x_vec_cpu, args.x_offset, args.x_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp index eba067c0..1c74f67f 100644 --- a/test/routines/level1/xaxpy.hpp +++ b/test/routines/level1/xaxpy.hpp @@ -75,12 +75,12 @@ class TestXaxpy { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Axpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -88,12 +88,12 @@ class TestXaxpy { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXaxpy(args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -102,15 +102,15 @@ class TestXaxpy { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXaxpy(args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp index 753f0da5..55980f30 100644 --- a/test/routines/level1/xcopy.hpp +++ b/test/routines/level1/xcopy.hpp @@ -74,12 +74,12 @@ class TestXcopy { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Copy<T>(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -87,12 +87,12 @@ class TestXcopy { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXcopy<T>(args.n, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -101,15 +101,15 @@ class TestXcopy { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXcopy(args.n, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp index 8127247d..1ea69c17 100644 --- a/test/routines/level1/xdot.hpp +++ b/test/routines/level1/xdot.hpp @@ -78,13 +78,13 @@ class TestXdot { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dot<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].scalar(), args.dot_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -92,13 +92,13 @@ class TestXdot { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdot<T>(args.n, - buffers.scalar, args.dot_offset, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].scalar, args.dot_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -107,18 +107,18 @@ class TestXdot { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXdot(args.n, scalar_cpu, args.dot_offset, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp index 96d97dc4..00dcf7c2 100644 --- a/test/routines/level1/xdotc.hpp +++ b/test/routines/level1/xdotc.hpp @@ -78,13 +78,13 @@ class TestXdotc { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotc<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].scalar(), args.dot_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -92,13 +92,13 @@ class TestXdotc { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotc<T>(args.n, - buffers.scalar, args.dot_offset, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].scalar, args.dot_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -107,18 +107,18 @@ class TestXdotc { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXdotc(args.n, scalar_cpu, args.dot_offset, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp index 70c7fceb..512de985 100644 --- a/test/routines/level1/xdotu.hpp +++ b/test/routines/level1/xdotu.hpp @@ -78,13 +78,13 @@ class TestXdotu { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotu<T>(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].scalar(), args.dot_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -92,13 +92,13 @@ class TestXdotu { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotu<T>(args.n, - buffers.scalar, args.dot_offset, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].scalar, args.dot_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -107,18 +107,18 @@ class TestXdotu { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXdotu(args.n, scalar_cpu, args.dot_offset, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp index ce33fe59..20f75226 100644 --- a/test/routines/level1/xnrm2.hpp +++ b/test/routines/level1/xnrm2.hpp @@ -74,12 +74,12 @@ class TestXnrm2 { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Nrm2<T>(args.n, - buffers.scalar(), args.nrm2_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers[0].scalar(), args.nrm2_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -87,12 +87,12 @@ class TestXnrm2 { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXnrm2<T>(args.n, - buffers.scalar, args.nrm2_offset, - buffers.x_vec, args.x_offset, args.x_inc, + buffers[0].scalar, args.nrm2_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -101,15 +101,15 @@ class TestXnrm2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXnrm2(args.n, scalar_cpu, args.nrm2_offset, x_vec_cpu, args.x_offset, args.x_inc); - buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp index d89688b4..e2600834 100644 --- a/test/routines/level1/xscal.hpp +++ b/test/routines/level1/xscal.hpp @@ -71,11 +71,11 @@ class TestXscal { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Scal(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers[0].x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -83,11 +83,11 @@ class TestXscal { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXscal(args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, + buffers[0].x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -96,12 +96,12 @@ class TestXscal { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXscal(args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp index 49b0d3d0..b9f06eb7 100644 --- a/test/routines/level1/xswap.hpp +++ b/test/routines/level1/xswap.hpp @@ -74,12 +74,12 @@ class TestXswap { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Swap<T>(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -87,12 +87,12 @@ class TestXswap { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXswap<T>(args.n, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -101,16 +101,16 @@ class TestXswap { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXswap(args.n, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp index f371b9a7..57c16104 100644 --- a/test/routines/level2/xgbmv.hpp +++ b/test/routines/level2/xgbmv.hpp @@ -86,14 +86,14 @@ class TestXgbmv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gbmv(args.layout, args.a_transpose, args.m, args.n, args.kl, args.ku, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -101,15 +101,15 @@ class TestXgbmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), args.m, args.n, args.kl, args.ku, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.x_vec, args.x_offset, args.x_inc, args.beta, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -118,20 +118,20 @@ class TestXgbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXgbmv(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), args.m, args.n, args.kl, args.ku, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp index 2442be4c..3c56c405 100644 --- a/test/routines/level2/xgemv.hpp +++ b/test/routines/level2/xgemv.hpp @@ -86,14 +86,14 @@ class TestXgemv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -101,15 +101,15 @@ class TestXgemv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgemv(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), args.m, args.n, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.x_vec, args.x_offset, args.x_inc, args.beta, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -118,20 +118,20 @@ class TestXgemv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXgemv(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), args.m, args.n, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp index 3e7ccbc3..f9a6fefd 100644 --- a/test/routines/level2/xger.hpp +++ b/test/routines/level2/xger.hpp @@ -82,14 +82,14 @@ class TestXger { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Ger(args.layout, args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -97,14 +97,14 @@ class TestXger { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXger(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, - buffers.a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -113,19 +113,19 @@ class TestXger { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXger(convertToCBLAS(args.layout), args.m, args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp index d880ae1f..ddc9030a 100644 --- a/test/routines/level2/xgerc.hpp +++ b/test/routines/level2/xgerc.hpp @@ -82,14 +82,14 @@ class TestXgerc { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gerc(args.layout, args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -97,14 +97,14 @@ class TestXgerc { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgerc(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, - buffers.a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -113,19 +113,19 @@ class TestXgerc { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXgerc(convertToCBLAS(args.layout), args.m, args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp index 1735e42a..8d5b8589 100644 --- a/test/routines/level2/xgeru.hpp +++ b/test/routines/level2/xgeru.hpp @@ -82,14 +82,14 @@ class TestXgeru { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Geru(args.layout, args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -97,14 +97,14 @@ class TestXgeru { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgeru(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, - buffers.a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -113,19 +113,19 @@ class TestXgeru { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXgeru(convertToCBLAS(args.layout), args.m, args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp index 99538bf1..50130359 100644 --- a/test/routines/level2/xhbmv.hpp +++ b/test/routines/level2/xhbmv.hpp @@ -80,14 +80,14 @@ class TestXhbmv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hbmv(args.layout, args.triangle, args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXhbmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.x_vec, args.x_offset, args.x_inc, args.beta, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,20 +112,20 @@ class TestXhbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXhbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.kl, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp index 3792cb66..f69b031c 100644 --- a/test/routines/level2/xhemv.hpp +++ b/test/routines/level2/xhemv.hpp @@ -80,14 +80,14 @@ class TestXhemv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemv(args.layout, args.triangle, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXhemv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhemv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.x_vec, args.x_offset, args.x_inc, args.beta, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,20 +112,20 @@ class TestXhemv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXhemv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp index c58eb189..c3d809bf 100644 --- a/test/routines/level2/xher.hpp +++ b/test/routines/level2/xher.hpp @@ -76,13 +76,13 @@ class TestXher { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Her(args.layout, args.triangle, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -90,14 +90,14 @@ class TestXher { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXher(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -106,17 +106,17 @@ class TestXher { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXher(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp index 8a7eb0b6..7ddf9ed1 100644 --- a/test/routines/level2/xher2.hpp +++ b/test/routines/level2/xher2.hpp @@ -80,14 +80,14 @@ class TestXher2 { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Her2(args.layout, args.triangle, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXher2 { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXher2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, - buffers.a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,20 +112,20 @@ class TestXher2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXher2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp index 0862b619..7fae80b8 100644 --- a/test/routines/level2/xhpmv.hpp +++ b/test/routines/level2/xhpmv.hpp @@ -80,14 +80,14 @@ class TestXhpmv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpmv(args.layout, args.triangle, args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].ap_mat(), args.ap_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXhpmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhpmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.ap_mat, args.ap_offset, - buffers.x_vec, args.x_offset, args.x_inc, args.beta, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].ap_mat, args.ap_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,20 +112,20 @@ class TestXhpmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXhpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, ap_mat_cpu, args.ap_offset, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp index 5b454174..a46cb8e6 100644 --- a/test/routines/level2/xhpr.hpp +++ b/test/routines/level2/xhpr.hpp @@ -76,13 +76,13 @@ class TestXhpr { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr(args.layout, args.triangle, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -90,14 +90,14 @@ class TestXhpr { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhpr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.ap_mat, args.ap_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -106,17 +106,17 @@ class TestXhpr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXhpr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp index b770da2e..08f12768 100644 --- a/test/routines/level2/xhpr2.hpp +++ b/test/routines/level2/xhpr2.hpp @@ -80,14 +80,14 @@ class TestXhpr2 { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr2(args.layout, args.triangle, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers[0].ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXhpr2 { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhpr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, - buffers.ap_mat, args.ap_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, + buffers[0].ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,20 +112,20 @@ class TestXhpr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXhpr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp index 7a836170..a45dbe8f 100644 --- a/test/routines/level2/xsbmv.hpp +++ b/test/routines/level2/xsbmv.hpp @@ -80,14 +80,14 @@ class TestXsbmv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Sbmv(args.layout, args.triangle, args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXsbmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.x_vec, args.x_offset, args.x_inc, args.beta, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,20 +112,20 @@ class TestXsbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXsbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.kl, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp index 352c8cfd..a455f652 100644 --- a/test/routines/level2/xspmv.hpp +++ b/test/routines/level2/xspmv.hpp @@ -80,14 +80,14 @@ class TestXspmv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spmv(args.layout, args.triangle, args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].ap_mat(), args.ap_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXspmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXspmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.ap_mat, args.ap_offset, - buffers.x_vec, args.x_offset, args.x_inc, args.beta, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].ap_mat, args.ap_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,20 +112,20 @@ class TestXspmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXspmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, ap_mat_cpu, args.ap_offset, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp index 988bcdc2..ab9ab85f 100644 --- a/test/routines/level2/xspr.hpp +++ b/test/routines/level2/xspr.hpp @@ -76,13 +76,13 @@ class TestXspr { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spr(args.layout, args.triangle, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -90,14 +90,14 @@ class TestXspr { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXspr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.ap_mat, args.ap_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -106,17 +106,17 @@ class TestXspr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXspr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp index ee517bc1..a73975a5 100644 --- a/test/routines/level2/xspr2.hpp +++ b/test/routines/level2/xspr2.hpp @@ -80,14 +80,14 @@ class TestXspr2 { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spr2(args.layout, args.triangle, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers[0].ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXspr2 { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXspr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, - buffers.ap_mat, args.ap_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, + buffers[0].ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,20 +112,20 @@ class TestXspr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXspr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, ap_mat_cpu, args.ap_offset); - buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp index 5eecfb74..c93492ed 100644 --- a/test/routines/level2/xsymv.hpp +++ b/test/routines/level2/xsymv.hpp @@ -80,14 +80,14 @@ class TestXsymv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Symv(args.layout, args.triangle, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXsymv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsymv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.x_vec, args.x_offset, args.x_inc, args.beta, - buffers.y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, + buffers[0].y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,20 +112,20 @@ class TestXsymv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXsymv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp index ac4ee1ff..ac2c5e98 100644 --- a/test/routines/level2/xsyr.hpp +++ b/test/routines/level2/xsyr.hpp @@ -76,13 +76,13 @@ class TestXsyr { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr(args.layout, args.triangle, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -90,14 +90,14 @@ class TestXsyr { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -106,17 +106,17 @@ class TestXsyr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXsyr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp index 43644883..9f8d315b 100644 --- a/test/routines/level2/xsyr2.hpp +++ b/test/routines/level2/xsyr2.hpp @@ -80,14 +80,14 @@ class TestXsyr2 { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2(args.layout, args.triangle, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXsyr2 { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec, args.x_offset, args.x_inc, - buffers.y_vec, args.y_offset, args.y_inc, - buffers.a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, + buffers[0].y_vec, args.y_offset, args.y_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,20 +112,20 @@ class TestXsyr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXsyr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp index ab9244af..2d964fda 100644 --- a/test/routines/level2/xtbmv.hpp +++ b/test/routines/level2/xtbmv.hpp @@ -75,13 +75,13 @@ class TestXtbmv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, args.kl, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -89,7 +89,7 @@ class TestXtbmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtbmv<T>(convertToCLBLAS(args.layout), @@ -97,8 +97,8 @@ class TestXtbmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, args.kl, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.x_vec, args.x_offset, args.x_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -107,11 +107,11 @@ class TestXtbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXtbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), @@ -119,7 +119,7 @@ class TestXtbmv { args.n, args.kl, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp index 3821e1a4..fcfd86bf 100644 --- a/test/routines/level2/xtpmv.hpp +++ b/test/routines/level2/xtpmv.hpp @@ -75,13 +75,13 @@ class TestXtpmv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers[0].ap_mat(), args.ap_offset, + buffers[0].x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -89,7 +89,7 @@ class TestXtpmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtpmv<T>(convertToCLBLAS(args.layout), @@ -97,8 +97,8 @@ class TestXtpmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers.ap_mat, args.ap_offset, - buffers.x_vec, args.x_offset, args.x_inc, + buffers[0].ap_mat, args.ap_offset, + buffers[0].x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -107,11 +107,11 @@ class TestXtpmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXtpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), @@ -119,7 +119,7 @@ class TestXtpmv { args.n, ap_mat_cpu, args.ap_offset, x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp index 7211c757..4e209584 100644 --- a/test/routines/level2/xtrmv.hpp +++ b/test/routines/level2/xtrmv.hpp @@ -75,13 +75,13 @@ class TestXtrmv { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -89,7 +89,7 @@ class TestXtrmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrmv<T>(convertToCLBLAS(args.layout), @@ -97,8 +97,8 @@ class TestXtrmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.x_vec, args.x_offset, args.x_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -107,11 +107,11 @@ class TestXtrmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXtrmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), @@ -119,7 +119,7 @@ class TestXtrmv { args.n, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp index 78b9672f..090684b1 100644 --- a/test/routines/level2/xtrsv.hpp +++ b/test/routines/level2/xtrsv.hpp @@ -90,13 +90,13 @@ class TestXtrsv { } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -104,7 +104,7 @@ class TestXtrsv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrsv<T>(convertToCLBLAS(args.layout), @@ -112,8 +112,8 @@ class TestXtrsv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.x_vec, args.x_offset, args.x_inc, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -122,11 +122,11 @@ class TestXtrsv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXtrsv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), @@ -134,7 +134,7 @@ class TestXtrsv { args.n, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc); - buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp index 1b12fb1c..5b220889 100644 --- a/test/routines/level3/xgemm.hpp +++ b/test/routines/level3/xgemm.hpp @@ -88,14 +88,14 @@ class TestXgemm { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, args.m, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta, + buffers[0].c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -103,16 +103,16 @@ class TestXgemm { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgemm(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.b_transpose), args.m, args.n, args.k, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.b_mat, args.b_offset, args.b_ld, args.beta, - buffers.c_mat, args.c_offset, args.c_ld, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].b_mat, args.b_offset, args.b_ld, args.beta, + buffers[0].c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -121,13 +121,13 @@ class TestXgemm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXgemm(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), convertToCBLAS(args.b_transpose), @@ -135,7 +135,7 @@ class TestXgemm { a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp index 76550b15..e6e8724f 100644 --- a/test/routines/level3/xhemm.hpp +++ b/test/routines/level3/xhemm.hpp @@ -88,14 +88,14 @@ class TestXhemm { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemm(args.layout, args.side, args.triangle, args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta, + buffers[0].c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -103,16 +103,16 @@ class TestXhemm { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhemm(convertToCLBLAS(args.layout), convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.b_mat, args.b_offset, args.b_ld, args.beta, - buffers.c_mat, args.c_offset, args.c_ld, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].b_mat, args.b_offset, args.b_ld, args.beta, + buffers[0].c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -121,13 +121,13 @@ class TestXhemm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXhemm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), @@ -135,7 +135,7 @@ class TestXhemm { a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp index 5ca3aac6..749eca11 100644 --- a/test/routines/level3/xher2k.hpp +++ b/test/routines/level3/xher2k.hpp @@ -86,15 +86,15 @@ class TestXher2k { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; auto status = Her2k(args.layout, args.triangle, args.a_transpose, args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta, + buffers[0].c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -102,7 +102,7 @@ class TestXher2k { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; @@ -110,9 +110,9 @@ class TestXher2k { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, alpha2, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.b_mat, args.b_offset, args.b_ld, args.beta, - buffers.c_mat, args.c_offset, args.c_ld, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].b_mat, args.b_offset, args.b_ld, args.beta, + buffers[0].c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -121,13 +121,13 @@ class TestXher2k { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); auto alpha2 = T{args.alpha, args.alpha}; cblasXher2k(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), @@ -136,7 +136,7 @@ class TestXher2k { a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp index e93d887a..e9193847 100644 --- a/test/routines/level3/xherk.hpp +++ b/test/routines/level3/xherk.hpp @@ -79,13 +79,13 @@ class TestXherk { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Herk(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers[0].a_mat(), args.a_offset, args.a_ld, args.beta, + buffers[0].c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -93,15 +93,15 @@ class TestXherk { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXherk(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, args.beta, - buffers.c_mat, args.c_offset, args.c_ld, + buffers[0].a_mat, args.a_offset, args.a_ld, args.beta, + buffers[0].c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -110,18 +110,18 @@ class TestXherk { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXherk(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp index 9d127e26..bcd74fda 100644 --- a/test/routines/level3/xsymm.hpp +++ b/test/routines/level3/xsymm.hpp @@ -88,14 +88,14 @@ class TestXsymm { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Symm(args.layout, args.side, args.triangle, args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta, + buffers[0].c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -103,16 +103,16 @@ class TestXsymm { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsymm(convertToCLBLAS(args.layout), convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.b_mat, args.b_offset, args.b_ld, args.beta, - buffers.c_mat, args.c_offset, args.c_ld, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].b_mat, args.b_offset, args.b_ld, args.beta, + buffers[0].c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -121,13 +121,13 @@ class TestXsymm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXsymm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), @@ -135,7 +135,7 @@ class TestXsymm { a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp index d1bdac56..c722e0cf 100644 --- a/test/routines/level3/xsyr2k.hpp +++ b/test/routines/level3/xsyr2k.hpp @@ -86,14 +86,14 @@ class TestXsyr2k { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2k(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta, + buffers[0].c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -101,16 +101,16 @@ class TestXsyr2k { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyr2k(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.b_mat, args.b_offset, args.b_ld, args.beta, - buffers.c_mat, args.c_offset, args.c_ld, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].b_mat, args.b_offset, args.b_ld, args.beta, + buffers[0].c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -119,13 +119,13 @@ class TestXsyr2k { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXsyr2k(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), @@ -133,7 +133,7 @@ class TestXsyr2k { a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp index 1330924e..7d5c2039 100644 --- a/test/routines/level3/xsyrk.hpp +++ b/test/routines/level3/xsyrk.hpp @@ -79,13 +79,13 @@ class TestXsyrk { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syrk(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers[0].a_mat(), args.a_offset, args.a_ld, args.beta, + buffers[0].c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -93,15 +93,15 @@ class TestXsyrk { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyrk(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, args.beta, - buffers.c_mat, args.c_offset, args.c_ld, + buffers[0].a_mat, args.a_offset, args.a_ld, args.beta, + buffers[0].c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -110,18 +110,18 @@ class TestXsyrk { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXsyrk(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp index 7c5bd842..50cca6f8 100644 --- a/test/routines/level3/xtrmm.hpp +++ b/test/routines/level3/xtrmm.hpp @@ -79,13 +79,13 @@ class TestXtrmm { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -93,7 +93,7 @@ class TestXtrmm { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrmm(convertToCLBLAS(args.layout), @@ -102,8 +102,8 @@ class TestXtrmm { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.m, args.n, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.b_mat, args.b_offset, args.b_ld, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].b_mat, args.b_offset, args.b_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -112,11 +112,11 @@ class TestXtrmm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); cblasXtrmm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), @@ -125,7 +125,7 @@ class TestXtrmm { args.m, args.n, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld); - buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); + buffers[0].b_mat.Write(queue, args.b_size, b_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp index a70ef03f..91f91d0b 100644 --- a/test/routines/level3/xtrsm.hpp +++ b/test/routines/level3/xtrsm.hpp @@ -91,13 +91,13 @@ class TestXtrsm { } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -105,7 +105,7 @@ class TestXtrsm { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrsm(convertToCLBLAS(args.layout), @@ -114,8 +114,8 @@ class TestXtrsm { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.m, args.n, args.alpha, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.b_mat, args.b_offset, args.b_ld, + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].b_mat, args.b_offset, args.b_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast<StatusCode>(status); @@ -124,11 +124,11 @@ class TestXtrsm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0)); std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0)); - buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); cblasXtrsm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), @@ -137,7 +137,7 @@ class TestXtrsm { args.m, args.n, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld); - buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); + buffers[0].b_mat.Write(queue, args.b_size, b_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp index b470dbf3..2cb1b2ce 100644 --- a/test/routines/levelx/xinvert.hpp +++ b/test/routines/levelx/xinvert.hpp @@ -173,14 +173,14 @@ class TestXinvert { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { try { auto event = cl_event{}; auto inverter = Xinvert<T>(queue, &event); inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal, args.n, args.m, - buffers.a_mat, args.a_offset, args.a_ld, - buffers.b_mat); + buffers[0].a_mat, args.a_offset, args.a_ld, + buffers[0].b_mat); clWaitForEvents(1, &event); clReleaseEvent(event); } catch (...) { return DispatchException(); } @@ -189,12 +189,12 @@ class TestXinvert { // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { + return RunReference(args, buffers[0], queue); } - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { + return RunReference(args, buffers[0], queue); } // Describes how to download the results of the computation (more importantly: which buffer) diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp index d1064d0c..69f0b2b6 100644 --- a/test/routines/levelx/xomatcopy.hpp +++ b/test/routines/levelx/xomatcopy.hpp @@ -133,13 +133,13 @@ class TestXomatcopy { std::vector<T>&, std::vector<T>&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Omatcopy<T>(args.layout, args.a_transpose, args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, + buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers[0].b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -147,12 +147,12 @@ class TestXomatcopy { // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. - static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { + return RunReference(args, buffers[0], queue); } - static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { - return RunReference(args, buffers, queue); + static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) { + return RunReference(args, buffers[0], queue); } // Describes how to download the results of the computation (more importantly: which buffer) |