51 files changed, 663 insertions, 626 deletions
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 330db597..58dc3b27 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -73,6 +73,7 @@ constexpr auto kArgAsumOffset = "offasum";
 constexpr auto kArgImaxOffset = "offimax";
 constexpr auto kArgAlpha = "alpha";
 constexpr auto kArgBeta = "beta";
+constexpr auto kArgBatchCount = "batch_count";
 
 // The tuner-specific arguments in string form
 constexpr auto kArgFraction = "fraction";
@@ -156,6 +157,8 @@ struct Arguments {
   size_t imax_offset = 0;
   T alpha = ConstantOne<T>();
   T beta = ConstantOne<T>();
+  size_t batch_count = 1;
+  // Sizes of the buffers, in number of elements (per batch)
   size_t x_size = 1;
   size_t y_size = 1;
   size_t a_size = 1;
diff --git a/test/correctness/misc/override_parameters.cpp b/test/correctness/misc/override_parameters.cpp
index e6eebef7..c6c70d9f 100644
--- a/test/correctness/misc/override_parameters.cpp
+++ b/test/correctness/misc/override_parameters.cpp
@@ -88,7 +88,7 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st
   device_b.Write(queue, host_b.size(), host_b);
   device_c.Write(queue, host_c.size(), host_c);
   auto dummy = Buffer<T>(context, 1);
-  auto buffers = Buffers<T>{dummy, dummy, device_a, device_b, device_c, dummy, dummy};
+  auto buffers = std::vector<Buffers<T>>{Buffers<T>{dummy, dummy, device_a, device_b, device_c, dummy, dummy}};
 
   // Loops over the valid combinations: run before and run afterwards
   fprintf(stdout, "* Testing OverrideParameters for '%s'\n", routine_name.c_str());
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 505b3b36..fcb2eceb 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -27,6 +27,7 @@ template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kIncr
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixDims = { 7, 64 };
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixVectorDims = { 61, 256 };
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kBandSizes = { 4, 19 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kBatchCounts = { 1, 3 };
 
 // Test settings for the invalid tests
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kInvalidIncrements = { 0, 1 };
@@ -81,15 +82,16 @@ TestBlas<T,U>::TestBlas(const std::vector<std::string> &arguments, const bool si
   const auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
   const auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
   const auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
+  const auto max_batch_count = *std::max_element(kBatchCounts.begin(), kBatchCounts.end());
 
   // Creates test input data
-  x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
-  y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
-  a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
-  scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset);
+  x_source_.resize(max_batch_count * (std::max(max_vec, max_matvec)*max_inc + max_offset)); // every batch slice includes its own offset
+  y_source_.resize(max_batch_count * (std::max(max_vec, max_matvec)*max_inc + max_offset));
+  a_source_.resize(max_batch_count * (std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset));
+  b_source_.resize(max_batch_count * (std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset));
+  c_source_.resize(max_batch_count * (std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset));
+  ap_source_.resize(max_batch_count * (std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset));
+  scalar_source_.resize(max_batch_count * (std::max(max_mat, max_matvec) + max_offset));
   std::mt19937 mt(kSeed);
   std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
   PopulateVector(x_source_, mt, dist);
@@ -124,21 +126,24 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
                   ap_source_, scalar_source_);
 
     // Set-up for the CLBlast run
-    auto x_vec2 = Buffer<T>(context_, args.x_size);
-    auto y_vec2 = Buffer<T>(context_, args.y_size);
-    auto a_mat2 = Buffer<T>(context_, args.a_size);
-    auto b_mat2 = Buffer<T>(context_, args.b_size);
-    auto c_mat2 = Buffer<T>(context_, args.c_size);
-    auto ap_mat2 = Buffer<T>(context_, args.ap_size);
-    auto scalar2 = Buffer<T>(context_, args.scalar_size);
-    x_vec2.Write(queue_, args.x_size, x_source_);
-    y_vec2.Write(queue_, args.y_size, y_source_);
-    a_mat2.Write(queue_, args.a_size, a_source_);
-    b_mat2.Write(queue_, args.b_size, b_source_);
-    c_mat2.Write(queue_, args.c_size, c_source_);
-    ap_mat2.Write(queue_, args.ap_size, ap_source_);
-    scalar2.Write(queue_, args.scalar_size, scalar_source_);
-    auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
+    auto buffers2 = std::vector<Buffers<T>>();
+    for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+      auto x_vec2 = Buffer<T>(context_, args.x_size);
+      auto y_vec2 = Buffer<T>(context_, args.y_size);
+      auto a_mat2 = Buffer<T>(context_, args.a_size);
+      auto b_mat2 = Buffer<T>(context_, args.b_size);
+      auto c_mat2 = Buffer<T>(context_, args.c_size);
+      auto ap_mat2 = Buffer<T>(context_, args.ap_size);
+      auto scalar2 = Buffer<T>(context_, args.scalar_size);
+      x_vec2.Write(queue_, args.x_size, &x_source_[batch * args.x_size]);
+      y_vec2.Write(queue_, args.y_size, &y_source_[batch * args.y_size]);
+      a_mat2.Write(queue_, args.a_size, &a_source_[batch * args.a_size]);
+      b_mat2.Write(queue_, args.b_size, &b_source_[batch * args.b_size]);
+      c_mat2.Write(queue_, args.c_size, &c_source_[batch * args.c_size]);
+      ap_mat2.Write(queue_, args.ap_size, &ap_source_[batch * args.ap_size]);
+      scalar2.Write(queue_, args.scalar_size, &scalar_source_[batch * args.scalar_size]);
+      buffers2.push_back(Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2});
+    }
 
     // Runs CLBlast
     if (verbose_) {
@@ -158,21 +163,24 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     }
 
     // Set-up for the reference run
-    auto x_vec1 = Buffer<T>(context_, args.x_size);
-    auto y_vec1 = Buffer<T>(context_, args.y_size);
-    auto a_mat1 = Buffer<T>(context_, args.a_size);
-    auto b_mat1 = Buffer<T>(context_, args.b_size);
-    auto c_mat1 = Buffer<T>(context_, args.c_size);
-    auto ap_mat1 = Buffer<T>(context_, args.ap_size);
-    auto scalar1 = Buffer<T>(context_, args.scalar_size);
-    x_vec1.Write(queue_, args.x_size, x_source_);
-    y_vec1.Write(queue_, args.y_size, y_source_);
-    a_mat1.Write(queue_, args.a_size, a_source_);
-    b_mat1.Write(queue_, args.b_size, b_source_);
-    c_mat1.Write(queue_, args.c_size, c_source_);
-    ap_mat1.Write(queue_, args.ap_size, ap_source_);
-    scalar1.Write(queue_, args.scalar_size, scalar_source_);
-    auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
+    auto buffers1 = std::vector<Buffers<T>>();
+    for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+      auto x_vec1 = Buffer<T>(context_, args.x_size);
+      auto y_vec1 = Buffer<T>(context_, args.y_size);
+      auto a_mat1 = Buffer<T>(context_, args.a_size);
+      auto b_mat1 = Buffer<T>(context_, args.b_size);
+      auto c_mat1 = Buffer<T>(context_, args.c_size);
+      auto ap_mat1 = Buffer<T>(context_, args.ap_size);
+      auto scalar1 = Buffer<T>(context_, args.scalar_size);
+      x_vec1.Write(queue_, args.x_size, &x_source_[batch * args.x_size]);
+      y_vec1.Write(queue_, args.y_size, &y_source_[batch * args.y_size]);
+      a_mat1.Write(queue_, args.a_size, &a_source_[batch * args.a_size]);
+      b_mat1.Write(queue_, args.b_size, &b_source_[batch * args.b_size]);
+      c_mat1.Write(queue_, args.c_size, &c_source_[batch * args.c_size]);
+      ap_mat1.Write(queue_, args.ap_size, &ap_source_[batch * args.ap_size]);
+      scalar1.Write(queue_, args.scalar_size, &scalar_source_[batch * args.scalar_size]);
+      buffers1.push_back(Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1});
+    }
 
     // Runs the reference code
     if (verbose_) {
@@ -189,46 +197,55 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
       continue;
     }
 
-    // Downloads the results
-    auto result1 = get_result_(args, buffers1, queue_);
-    auto result2 = get_result_(args, buffers2, queue_);
-
-    // Computes the L2 error
-    const auto kErrorMarginL2 = getL2ErrorMargin<T>();
+    // Error checking for each batch
+    auto errors = size_t{0};
     auto l2error = 0.0;
-    for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
-      for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
-        auto index = get_index_(args, id1, id2);
-        l2error += SquaredDifference(result1[index], result2[index]);
+    for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+
+      // Downloads the results
+      auto result1 = get_result_(args, buffers1[batch], queue_);
+      auto result2 = get_result_(args, buffers2[batch], queue_);
+
+      // Computes the L2 error
+      auto l2error_batch = 0.0;
+      const auto kErrorMarginL2 = getL2ErrorMargin<T>();
+      for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
+        for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
+          auto index = get_index_(args, id1, id2);
+          l2error_batch += SquaredDifference(result1[index], result2[index]);
+        }
       }
-    }
-    l2error /= (get_id1_(args) * get_id2_(args));
-
-    // Checks for differences in the output
-    auto errors = size_t{0};
-    for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
-      for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
-        auto index = get_index_(args, id1, id2);
-        if (!TestSimilarity(result1[index], result2[index])) {
-          if (l2error >= kErrorMarginL2) { errors++; }
-          if (verbose_) {
-            if (get_id2_(args) == 1) { fprintf(stdout, "\n   Error at index %zu: ", id1); }
-            else { fprintf(stdout, "\n   Error at %zu,%zu: ", id1, id2); }
-            fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str());
-            fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str());
-            if (l2error < kErrorMarginL2) {
-              fprintf(stdout, " - error suppressed by a low total L2 error\n");
+      l2error_batch /= static_cast<double>(get_id1_(args) * get_id2_(args));
+      l2error += l2error_batch;
+
+      // Checks for differences in the output
+      for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
+        for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
+          auto index = get_index_(args, id1, id2);
+          if (!TestSimilarity(result1[index], result2[index])) {
+            if (l2error_batch >= kErrorMarginL2) { errors++; }
+            if (verbose_) {
+              if (get_id2_(args) == 1) { fprintf(stdout, "\n   Error at index %zu: ", id1); }
+              else { fprintf(stdout, "\n   Error at %zu,%zu: ", id1, id2); }
+              fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str());
+              fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str());
+              if (l2error_batch < kErrorMarginL2) {
+                fprintf(stdout, " - error suppressed by a low total L2 error\n");
+              }
             }
           }
         }
       }
     }
+    l2error /= static_cast<double>(args.batch_count);
+
+    // Report the results
     if (verbose_ && errors > 0) {
-      fprintf(stdout, "\n   Combined L2 error: %.2e\n   ", l2error);
+      fprintf(stdout, "\n   Combined average L2 error: %.2e\n   ", l2error);
     }
 
     // Tests the error count (should be zero)
-    TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
+    TestErrorCount(errors, get_id1_(args)*get_id2_(args)*args.batch_count, args);
   }
   TestEnd();
 }
@@ -255,36 +272,40 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
 
     // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
     // want to be able to create invalid buffers (no error checking here).
-    auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
-    auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
-    auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
-    auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
-    auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
-    auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
-    auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr);
-    auto x_vec1 = Buffer<T>(x1);
-    auto y_vec1 = Buffer<T>(y1);
-    auto a_mat1 = Buffer<T>(a1);
-    auto b_mat1 = Buffer<T>(b1);
-    auto c_mat1 = Buffer<T>(c1);
-    auto ap_mat1 = Buffer<T>(ap1);
-    auto scalar1 = Buffer<T>(d1);
-    auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
-    auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
-    auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
-    auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
-    auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
-    auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
-    auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr);
-    auto x_vec2 = Buffer<T>(x2);
-    auto y_vec2 = Buffer<T>(y2);
-    auto a_mat2 = Buffer<T>(a2);
-    auto b_mat2 = Buffer<T>(b2);
-    auto c_mat2 = Buffer<T>(c2);
-    auto ap_mat2 = Buffer<T>(ap2);
-    auto scalar2 = Buffer<T>(d2);
-    auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
-    auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
+    auto buffers1 = std::vector<Buffers<T>>();
+    auto buffers2 = std::vector<Buffers<T>>();
+    for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+      auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
+      auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
+      auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
+      auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
+      auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
+      auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
+      auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr);
+      auto x_vec1 = Buffer<T>(x1);
+      auto y_vec1 = Buffer<T>(y1);
+      auto a_mat1 = Buffer<T>(a1);
+      auto b_mat1 = Buffer<T>(b1);
+      auto c_mat1 = Buffer<T>(c1);
+      auto ap_mat1 = Buffer<T>(ap1);
+      auto scalar1 = Buffer<T>(d1);
+      auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
+      auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
+      auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
+      auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
+      auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
+      auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
+      auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr);
+      auto x_vec2 = Buffer<T>(x2);
+      auto y_vec2 = Buffer<T>(y2);
+      auto a_mat2 = Buffer<T>(a2);
+      auto b_mat2 = Buffer<T>(b2);
+      auto c_mat2 = Buffer<T>(c2);
+      auto ap_mat2 = Buffer<T>(ap2);
+      auto scalar2 = Buffer<T>(d2);
+      buffers1.push_back(Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1});
+      buffers2.push_back(Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2});
+    }
 
     // Runs CLBlast
     if (verbose_) {
diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp
index ee795aad..e675fa9b 100644
--- a/test/correctness/testblas.hpp
+++ b/test/correctness/testblas.hpp
@@ -56,6 +56,7 @@ class TestBlas: public Tester<T,U> {
   static const std::vector<size_t> kMatrixDims;
   static const std::vector<size_t> kMatrixVectorDims;
   static const std::vector<size_t> kBandSizes;
+  static const std::vector<size_t> kBatchCounts;
   const std::vector<size_t> kOffsets;
   const std::vector<U> kAlphaValues;
   const std::vector<U> kBetaValues;
@@ -78,7 +79,7 @@ class TestBlas: public Tester<T,U> {
                                          std::vector<T>&, std::vector<T>&,
                                          std::vector<T>&, std::vector<T>&, std::vector<T>&,
                                          std::vector<T>&, std::vector<T>&)>;
-  using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
+  using Routine = std::function<StatusCode(const Arguments<U>&, std::vector<Buffers<T>>&, Queue&)>;
   using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers<T>&, Queue&)>;
   using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
   using ResultIterator = std::function<size_t(const Arguments<U>&)>;
@@ -183,6 +184,7 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
   auto imax_offsets = std::vector<size_t>{args.imax_offset};
   auto alphas = std::vector<U>{args.alpha};
   auto betas = std::vector<U>{args.beta};
+  auto batch_counts = std::vector<size_t>{args.batch_count};
   auto x_sizes = std::vector<size_t>{args.x_size};
   auto y_sizes = std::vector<size_t>{args.y_size};
   auto a_sizes = std::vector<size_t>{args.a_size};
@@ -226,6 +228,7 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
     if (option == kArgImaxOffset) { imax_offsets = tester.kOffsets; }
     if (option == kArgAlpha) { alphas = tester.kAlphaValues; }
     if (option == kArgBeta) { betas = tester.kBetaValues; }
+    if (option == kArgBatchCount) { batch_counts = tester.kBatchCounts; }
 
     if (option == kArgXOffset) { x_sizes = tester.kVecSizes; }
     if (option == kArgYOffset) { y_sizes = tester.kVecSizes; }
@@ -268,8 +271,10 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
                                                     for (auto &imax_offset: imax_offsets) { r_args.imax_offset = imax_offset;
                                                       for (auto &alpha: alphas) { r_args.alpha = alpha;
                                                         for (auto &beta: betas) { r_args.beta = beta;
-                                                          C::SetSizes(r_args);
-                                                          regular_test_vector.push_back(r_args);
+                                                          for (auto &batch_count: batch_counts) { r_args.batch_count = batch_count;
+                                                            C::SetSizes(r_args);
+                                                            regular_test_vector.push_back(r_args);
+                                                          }
                                                         }
                                                       }
                                                     }
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index cbfc5bb2..40784fdb 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -367,6 +367,7 @@ std::string Tester<T,U>::GetOptionsString(const Arguments<U> &args) {
     if (o == kArgDotOffset){ result += kArgDotOffset + equals + ToString(args.dot_offset) + " "; }
     if (o == kArgAlpha)    { result += kArgAlpha + equals + ToString(args.alpha) + " "; }
     if (o == kArgBeta)     { result += kArgBeta + equals + ToString(args.beta) + " "; }
+    if (o == kArgBatchCount){result += kArgBatchCount + equals + ToString(args.batch_count) + " "; }
   }
   return result;
 }
diff --git a/test/performance/client.cpp b/test/performance/client.cpp
index 16b44b5a..b1d5b718 100644
--- a/test/performance/client.cpp
+++ b/test/performance/client.cpp
@@ -94,6 +94,9 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t le
     // Scalar values 
     if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar<U>()); }
     if (o == kArgBeta)  { args.beta  = GetArgument(command_line_args, help, kArgBeta, GetScalar<U>()); }
+
+    // Batch arguments
+    if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, size_t{1}); }
   }
 
   // These are the options common to all routines
@@ -174,13 +177,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
     set_sizes(args);
 
     // Populates input host matrices with random data
-    std::vector<T> x_source(args.x_size);
-    std::vector<T> y_source(args.y_size);
-    std::vector<T> a_source(args.a_size);
-    std::vector<T> b_source(args.b_size);
-    std::vector<T> c_source(args.c_size);
-    std::vector<T> ap_source(args.ap_size);
-    std::vector<T> scalar_source(args.scalar_size);
+    std::vector<T> x_source(args.batch_count * args.x_size);
+    std::vector<T> y_source(args.batch_count * args.y_size);
+    std::vector<T> a_source(args.batch_count * args.a_size);
+    std::vector<T> b_source(args.batch_count * args.b_size);
+    std::vector<T> c_source(args.batch_count * args.c_size);
+    std::vector<T> ap_source(args.batch_count * args.ap_size);
+    std::vector<T> scalar_source(args.batch_count * args.scalar_size);
     std::mt19937 mt(kSeed);
     std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
     PopulateVector(x_source, mt, dist);
@@ -192,21 +195,24 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
     PopulateVector(scalar_source, mt, dist);
 
     // Creates the matrices on the device
-    auto x_vec = Buffer<T>(context, args.x_size);
-    auto y_vec = Buffer<T>(context, args.y_size);
-    auto a_mat = Buffer<T>(context, args.a_size);
-    auto b_mat = Buffer<T>(context, args.b_size);
-    auto c_mat = Buffer<T>(context, args.c_size);
-    auto ap_mat = Buffer<T>(context, args.ap_size);
-    auto scalar = Buffer<T>(context, args.scalar_size);
-    x_vec.Write(queue, args.x_size, x_source);
-    y_vec.Write(queue, args.y_size, y_source);
-    a_mat.Write(queue, args.a_size, a_source);
-    b_mat.Write(queue, args.b_size, b_source);
-    c_mat.Write(queue, args.c_size, c_source);
-    ap_mat.Write(queue, args.ap_size, ap_source);
-    scalar.Write(queue, args.scalar_size, scalar_source);
-    auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar};
+    auto buffers = std::vector<Buffers<T>>();
+    for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+      auto x_vec = Buffer<T>(context, args.x_size);
+      auto y_vec = Buffer<T>(context, args.y_size);
+      auto a_mat = Buffer<T>(context, args.a_size);
+      auto b_mat = Buffer<T>(context, args.b_size);
+      auto c_mat = Buffer<T>(context, args.c_size);
+      auto ap_mat = Buffer<T>(context, args.ap_size);
+      auto scalar = Buffer<T>(context, args.scalar_size);
+      x_vec.Write(queue, args.x_size, &x_source[batch * args.x_size]);
+      y_vec.Write(queue, args.y_size, &y_source[batch * args.y_size]);
+      a_mat.Write(queue, args.a_size, &a_source[batch * args.a_size]);
+      b_mat.Write(queue, args.b_size, &b_source[batch * args.b_size]);
+      c_mat.Write(queue, args.c_size, &c_source[batch * args.c_size]);
+      ap_mat.Write(queue, args.ap_size, &ap_source[batch * args.ap_size]);
+      scalar.Write(queue, args.scalar_size, &scalar_source[batch * args.scalar_size]);
+      buffers.push_back(Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar});
+    }
 
     // Runs the routines and collects the timings
     auto timings = std::vector<std::pair<std::string, double>>();
@@ -248,7 +254,7 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
 // value found in the vector of timing results. The return value is in milliseconds.
 template <typename T, typename U>
 double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
-                                   Buffers<T> &buffers, Queue &queue,
+                                   std::vector<Buffers<T>> &buffers, Queue &queue,
                                    Routine run_blas, const std::string &library_name) {
   auto status = StatusCode::kSuccess;
 
@@ -339,6 +345,7 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args,
     else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); }
     else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); }
     else if (o == kArgImaxOffset){integers.push_back(args.imax_offset); }
+    else if (o == kArgBatchCount){integers.push_back(args.batch_count); }
   }
   auto strings = std::vector<std::string>{};
   for (auto &o: options_) {
diff --git a/test/performance/client.hpp b/test/performance/client.hpp
index 4b3e17c7..a8e31419 100644
--- a/test/performance/client.hpp
+++ b/test/performance/client.hpp
@@ -43,7 +43,7 @@ class Client {
   static constexpr auto kSeed = 42; // fixed seed for reproducibility
 
   // Shorthand for the routine-specific functions passed to the tester
-  using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
+  using Routine = std::function<StatusCode(const Arguments<U>&, std::vector<Buffers<T>>&, Queue&)>;
   using SetMetric = std::function<void(Arguments<U>&)>;
   using GetMetric = std::function<size_t(const Arguments<U>&)>;
 
@@ -66,7 +66,7 @@ class Client {
  private:
 
   // Runs a function a given number of times and returns the execution time of the shortest instance
-  double TimedExecution(const size_t num_runs, const Arguments<U> &args, Buffers<T> &buffers,
+  double TimedExecution(const size_t num_runs, const Arguments<U> &args, std::vector<Buffers<T>> &buffers,
                         Queue &queue, Routine run_blas, const std::string &library_name);
 
   // Prints the header of a performance-data table
diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp
index a22f681f..faffff33 100644
--- a/test/routines/level1/xamax.hpp
+++ b/test/routines/level1/xamax.hpp
@@ -74,12 +74,12 @@ class TestXamax {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Amax<T>(args.n,
-                          buffers.scalar(), args.imax_offset,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].scalar(), args.imax_offset,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -87,12 +87,12 @@ class TestXamax {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXamax<T>(args.n,
-                                   buffers.scalar, args.imax_offset,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].scalar, args.imax_offset,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -101,15 +101,15 @@ class TestXamax {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXamax(args.n,
                  scalar_cpu, args.imax_offset,
                  x_vec_cpu, args.x_offset, args.x_inc);
-      buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
+      buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp
index 64377189..fb2c9f1a 100644
--- a/test/routines/level1/xasum.hpp
+++ b/test/routines/level1/xasum.hpp
@@ -74,12 +74,12 @@ class TestXasum {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Asum<T>(args.n,
-                          buffers.scalar(), args.asum_offset,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].scalar(), args.asum_offset,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -87,12 +87,12 @@ class TestXasum {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXasum<T>(args.n,
-                                   buffers.scalar, args.asum_offset,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].scalar, args.asum_offset,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -101,15 +101,15 @@ class TestXasum {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXasum(args.n,
                  scalar_cpu, args.asum_offset,
                  x_vec_cpu, args.x_offset, args.x_inc);
-      buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
+      buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp
index eba067c0..1c74f67f 100644
--- a/test/routines/level1/xaxpy.hpp
+++ b/test/routines/level1/xaxpy.hpp
@@ -75,12 +75,12 @@ class TestXaxpy {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Axpy(args.n, args.alpha,
-                       buffers.x_vec(), args.x_offset, args.x_inc,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -88,12 +88,12 @@ class TestXaxpy {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXaxpy(args.n, args.alpha,
-                                buffers.x_vec, args.x_offset, args.x_inc,
-                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers[0].x_vec, args.x_offset, args.x_inc,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -102,15 +102,15 @@ class TestXaxpy {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXaxpy(args.n, args.alpha,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp
index 753f0da5..55980f30 100644
--- a/test/routines/level1/xcopy.hpp
+++ b/test/routines/level1/xcopy.hpp
@@ -74,12 +74,12 @@ class TestXcopy {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Copy<T>(args.n,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
-                          buffers.y_vec(), args.y_offset, args.y_inc,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -87,12 +87,12 @@ class TestXcopy {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXcopy<T>(args.n,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
-                                   buffers.y_vec, args.y_offset, args.y_inc,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].y_vec, args.y_offset, args.y_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -101,15 +101,15 @@ class TestXcopy {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXcopy(args.n,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level1/xdot.hpp b/test/routines/level1/xdot.hpp
index 8127247d..1ea69c17 100644
--- a/test/routines/level1/xdot.hpp
+++ b/test/routines/level1/xdot.hpp
@@ -78,13 +78,13 @@ class TestXdot {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Dot<T>(args.n,
-                         buffers.scalar(), args.dot_offset,
-                         buffers.x_vec(), args.x_offset, args.x_inc,
-                         buffers.y_vec(), args.y_offset, args.y_inc,
+                         buffers[0].scalar(), args.dot_offset,
+                         buffers[0].x_vec(), args.x_offset, args.x_inc,
+                         buffers[0].y_vec(), args.y_offset, args.y_inc,
                          &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -92,13 +92,13 @@ class TestXdot {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXdot<T>(args.n,
-                                  buffers.scalar, args.dot_offset,
-                                  buffers.x_vec, args.x_offset, args.x_inc,
-                                  buffers.y_vec, args.y_offset, args.y_inc,
+                                  buffers[0].scalar, args.dot_offset,
+                                  buffers[0].x_vec, args.x_offset, args.x_inc,
+                                  buffers[0].y_vec, args.y_offset, args.y_inc,
                                   1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -107,18 +107,18 @@ class TestXdot {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXdot(args.n,
                 scalar_cpu, args.dot_offset,
                 x_vec_cpu, args.x_offset, args.x_inc,
                 y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
+      buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp
index 96d97dc4..00dcf7c2 100644
--- a/test/routines/level1/xdotc.hpp
+++ b/test/routines/level1/xdotc.hpp
@@ -78,13 +78,13 @@ class TestXdotc {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Dotc<T>(args.n,
-                          buffers.scalar(), args.dot_offset,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
-                          buffers.y_vec(), args.y_offset, args.y_inc,
+                          buffers[0].scalar(), args.dot_offset,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -92,13 +92,13 @@ class TestXdotc {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXdotc<T>(args.n,
-                                   buffers.scalar, args.dot_offset,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
-                                   buffers.y_vec, args.y_offset, args.y_inc,
+                                   buffers[0].scalar, args.dot_offset,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].y_vec, args.y_offset, args.y_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -107,18 +107,18 @@ class TestXdotc {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXdotc(args.n,
                  scalar_cpu, args.dot_offset,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
+      buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp
index 70c7fceb..512de985 100644
--- a/test/routines/level1/xdotu.hpp
+++ b/test/routines/level1/xdotu.hpp
@@ -78,13 +78,13 @@ class TestXdotu {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Dotu<T>(args.n,
-                          buffers.scalar(), args.dot_offset,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
-                          buffers.y_vec(), args.y_offset, args.y_inc,
+                          buffers[0].scalar(), args.dot_offset,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -92,13 +92,13 @@ class TestXdotu {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXdotu<T>(args.n,
-                                   buffers.scalar, args.dot_offset,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
-                                   buffers.y_vec, args.y_offset, args.y_inc,
+                                   buffers[0].scalar, args.dot_offset,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].y_vec, args.y_offset, args.y_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -107,18 +107,18 @@ class TestXdotu {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXdotu(args.n,
                  scalar_cpu, args.dot_offset,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
+      buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp
index ce33fe59..20f75226 100644
--- a/test/routines/level1/xnrm2.hpp
+++ b/test/routines/level1/xnrm2.hpp
@@ -74,12 +74,12 @@ class TestXnrm2 {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Nrm2<T>(args.n,
-                          buffers.scalar(), args.nrm2_offset,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].scalar(), args.nrm2_offset,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -87,12 +87,12 @@ class TestXnrm2 {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXnrm2<T>(args.n,
-                                   buffers.scalar, args.nrm2_offset,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].scalar, args.nrm2_offset,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -101,15 +101,15 @@ class TestXnrm2 {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXnrm2(args.n,
                  scalar_cpu, args.nrm2_offset,
                  x_vec_cpu, args.x_offset, args.x_inc);
-      buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
+      buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp
index d89688b4..e2600834 100644
--- a/test/routines/level1/xscal.hpp
+++ b/test/routines/level1/xscal.hpp
@@ -71,11 +71,11 @@ class TestXscal {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Scal(args.n, args.alpha,
-                       buffers.x_vec(), args.x_offset, args.x_inc,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -83,11 +83,11 @@ class TestXscal {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXscal(args.n, args.alpha,
-                                buffers.x_vec, args.x_offset, args.x_inc,
+                                buffers[0].x_vec, args.x_offset, args.x_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -96,12 +96,12 @@ class TestXscal {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXscal(args.n, args.alpha,
                  x_vec_cpu, args.x_offset, args.x_inc);
-      buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
+      buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp
index 49b0d3d0..b9f06eb7 100644
--- a/test/routines/level1/xswap.hpp
+++ b/test/routines/level1/xswap.hpp
@@ -74,12 +74,12 @@ class TestXswap {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Swap<T>(args.n,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
-                          buffers.y_vec(), args.y_offset, args.y_inc,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].y_vec(), args.y_offset, args.y_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -87,12 +87,12 @@ class TestXswap {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXswap<T>(args.n,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
-                                   buffers.y_vec, args.y_offset, args.y_inc,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].y_vec, args.y_offset, args.y_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -101,16 +101,16 @@ class TestXswap {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXswap(args.n,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp
index f371b9a7..57c16104 100644
--- a/test/routines/level2/xgbmv.hpp
+++ b/test/routines/level2/xgbmv.hpp
@@ -86,14 +86,14 @@ class TestXgbmv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Gbmv(args.layout, args.a_transpose,
                        args.m, args.n, args.kl, args.ku, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -101,15 +101,15 @@ class TestXgbmv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXgbmv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.a_transpose),
                                 args.m, args.n, args.kl, args.ku, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -118,20 +118,20 @@ class TestXgbmv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXgbmv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.a_transpose),
                  args.m, args.n, args.kl, args.ku, args.alpha,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  x_vec_cpu, args.x_offset, args.x_inc, args.beta,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp
index 2442be4c..3c56c405 100644
--- a/test/routines/level2/xgemv.hpp
+++ b/test/routines/level2/xgemv.hpp
@@ -86,14 +86,14 @@ class TestXgemv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Gemv(args.layout, args.a_transpose,
                        args.m, args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -101,15 +101,15 @@ class TestXgemv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXgemv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.a_transpose),
                                 args.m, args.n, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -118,20 +118,20 @@ class TestXgemv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXgemv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.a_transpose),
                  args.m, args.n, args.alpha,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  x_vec_cpu, args.x_offset, args.x_inc, args.beta,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp
index 3e7ccbc3..f9a6fefd 100644
--- a/test/routines/level2/xger.hpp
+++ b/test/routines/level2/xger.hpp
@@ -82,14 +82,14 @@ class TestXger {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Ger(args.layout,
                       args.m, args.n, args.alpha,
-                      buffers.x_vec(), args.x_offset, args.x_inc,
-                      buffers.y_vec(), args.y_offset, args.y_inc,
-                      buffers.a_mat(), args.a_offset, args.a_ld,
+                      buffers[0].x_vec(), args.x_offset, args.x_inc,
+                      buffers[0].y_vec(), args.y_offset, args.y_inc,
+                      buffers[0].a_mat(), args.a_offset, args.a_ld,
                       &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -97,14 +97,14 @@ class TestXger {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXger(convertToCLBLAS(args.layout),
                                args.m, args.n, args.alpha,
-                               buffers.x_vec, args.x_offset, args.x_inc,
-                               buffers.y_vec, args.y_offset, args.y_inc,
-                               buffers.a_mat, args.a_offset, args.a_ld,
+                               buffers[0].x_vec, args.x_offset, args.x_inc,
+                               buffers[0].y_vec, args.y_offset, args.y_inc,
+                               buffers[0].a_mat, args.a_offset, args.a_ld,
                                1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -113,19 +113,19 @@ class TestXger {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXger(convertToCBLAS(args.layout),
                 args.m, args.n, args.alpha,
                 x_vec_cpu, args.x_offset, args.x_inc,
                 y_vec_cpu, args.y_offset, args.y_inc,
                 a_mat_cpu, args.a_offset, args.a_ld);
-      buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+      buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp
index d880ae1f..ddc9030a 100644
--- a/test/routines/level2/xgerc.hpp
+++ b/test/routines/level2/xgerc.hpp
@@ -82,14 +82,14 @@ class TestXgerc {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Gerc(args.layout,
                        args.m, args.n, args.alpha,
-                       buffers.x_vec(), args.x_offset, args.x_inc,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -97,14 +97,14 @@ class TestXgerc {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXgerc(convertToCLBLAS(args.layout),
                                 args.m, args.n, args.alpha,
-                                buffers.x_vec, args.x_offset, args.x_inc,
-                                buffers.y_vec, args.y_offset, args.y_inc,
-                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers[0].x_vec, args.x_offset, args.x_inc,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -113,19 +113,19 @@ class TestXgerc {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXgerc(convertToCBLAS(args.layout),
                  args.m, args.n, args.alpha,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc,
                  a_mat_cpu, args.a_offset, args.a_ld);
-      buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+      buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp
index 1735e42a..8d5b8589 100644
--- a/test/routines/level2/xgeru.hpp
+++ b/test/routines/level2/xgeru.hpp
@@ -82,14 +82,14 @@ class TestXgeru {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Geru(args.layout,
                        args.m, args.n, args.alpha,
-                       buffers.x_vec(), args.x_offset, args.x_inc,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -97,14 +97,14 @@ class TestXgeru {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXgeru(convertToCLBLAS(args.layout),
                                 args.m, args.n, args.alpha,
-                                buffers.x_vec, args.x_offset, args.x_inc,
-                                buffers.y_vec, args.y_offset, args.y_inc,
-                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers[0].x_vec, args.x_offset, args.x_inc,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -113,19 +113,19 @@ class TestXgeru {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXgeru(convertToCBLAS(args.layout),
                  args.m, args.n, args.alpha,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc,
                  a_mat_cpu, args.a_offset, args.a_ld);
-      buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+      buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp
index 99538bf1..50130359 100644
--- a/test/routines/level2/xhbmv.hpp
+++ b/test/routines/level2/xhbmv.hpp
@@ -80,14 +80,14 @@ class TestXhbmv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Hbmv(args.layout, args.triangle,
                        args.n, args.kl, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -95,15 +95,15 @@ class TestXhbmv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXhbmv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.kl, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,20 +112,20 @@ class TestXhbmv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXhbmv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  args.n, args.kl, args.alpha,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  x_vec_cpu, args.x_offset, args.x_inc, args.beta,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp
index 3792cb66..f69b031c 100644
--- a/test/routines/level2/xhemv.hpp
+++ b/test/routines/level2/xhemv.hpp
@@ -80,14 +80,14 @@ class TestXhemv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Hemv(args.layout, args.triangle,
                        args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -95,15 +95,15 @@ class TestXhemv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXhemv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,20 +112,20 @@ class TestXhemv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXhemv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  args.n, args.alpha,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  x_vec_cpu, args.x_offset, args.x_inc, args.beta,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp
index c58eb189..c3d809bf 100644
--- a/test/routines/level2/xher.hpp
+++ b/test/routines/level2/xher.hpp
@@ -76,13 +76,13 @@ class TestXher {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Her(args.layout, args.triangle,
                       args.n, args.alpha,
-                      buffers.x_vec(), args.x_offset, args.x_inc,
-                      buffers.a_mat(), args.a_offset, args.a_ld,
+                      buffers[0].x_vec(), args.x_offset, args.x_inc,
+                      buffers[0].a_mat(), args.a_offset, args.a_ld,
                       &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -90,14 +90,14 @@ class TestXher {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXher(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
-                               buffers.x_vec, args.x_offset, args.x_inc,
-                               buffers.a_mat, args.a_offset, args.a_ld,
+                               buffers[0].x_vec, args.x_offset, args.x_inc,
+                               buffers[0].a_mat, args.a_offset, args.a_ld,
                                1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -106,17 +106,17 @@ class TestXher {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXher(convertToCBLAS(args.layout),
                 convertToCBLAS(args.triangle),
                 args.n, args.alpha,
                 x_vec_cpu, args.x_offset, args.x_inc,
                 a_mat_cpu, args.a_offset, args.a_ld);
-      buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+      buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp
index 8a7eb0b6..7ddf9ed1 100644
--- a/test/routines/level2/xher2.hpp
+++ b/test/routines/level2/xher2.hpp
@@ -80,14 +80,14 @@ class TestXher2 {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Her2(args.layout, args.triangle,
                        args.n, args.alpha,
-                       buffers.x_vec(), args.x_offset, args.x_inc,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -95,15 +95,15 @@ class TestXher2 {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXher2(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.x_vec, args.x_offset, args.x_inc,
-                                buffers.y_vec, args.y_offset, args.y_inc,
-                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers[0].x_vec, args.x_offset, args.x_inc,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,20 +112,20 @@ class TestXher2 {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXher2(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  args.n, args.alpha,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc,
                  a_mat_cpu, args.a_offset, args.a_ld);
-      buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+      buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp
index 0862b619..7fae80b8 100644
--- a/test/routines/level2/xhpmv.hpp
+++ b/test/routines/level2/xhpmv.hpp
@@ -80,14 +80,14 @@ class TestXhpmv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Hpmv(args.layout, args.triangle,
                        args.n, args.alpha,
-                       buffers.ap_mat(), args.ap_offset,
-                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].ap_mat(), args.ap_offset,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -95,15 +95,15 @@ class TestXhpmv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXhpmv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.ap_mat, args.ap_offset,
-                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers[0].ap_mat, args.ap_offset,
+                                buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,20 +112,20 @@ class TestXhpmv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXhpmv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  args.n, args.alpha,
                  ap_mat_cpu, args.ap_offset,
                  x_vec_cpu, args.x_offset, args.x_inc, args.beta,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp
index 5b454174..a46cb8e6 100644
--- a/test/routines/level2/xhpr.hpp
+++ b/test/routines/level2/xhpr.hpp
@@ -76,13 +76,13 @@ class TestXhpr {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Hpr(args.layout, args.triangle,
                       args.n, args.alpha,
-                      buffers.x_vec(), args.x_offset, args.x_inc,
-                      buffers.ap_mat(), args.ap_offset,
+                      buffers[0].x_vec(), args.x_offset, args.x_inc,
+                      buffers[0].ap_mat(), args.ap_offset,
                       &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -90,14 +90,14 @@ class TestXhpr {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXhpr(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
-                               buffers.x_vec, args.x_offset, args.x_inc,
-                               buffers.ap_mat, args.ap_offset,
+                               buffers[0].x_vec, args.x_offset, args.x_inc,
+                               buffers[0].ap_mat, args.ap_offset,
                                1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -106,17 +106,17 @@ class TestXhpr {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXhpr(convertToCBLAS(args.layout),
                 convertToCBLAS(args.triangle),
                 args.n, args.alpha,
                 x_vec_cpu, args.x_offset, args.x_inc,
                 ap_mat_cpu, args.ap_offset);
-      buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp
index b770da2e..08f12768 100644
--- a/test/routines/level2/xhpr2.hpp
+++ b/test/routines/level2/xhpr2.hpp
@@ -80,14 +80,14 @@ class TestXhpr2 {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Hpr2(args.layout, args.triangle,
                        args.n, args.alpha,
-                       buffers.x_vec(), args.x_offset, args.x_inc,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
-                       buffers.ap_mat(), args.ap_offset,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].ap_mat(), args.ap_offset,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -95,15 +95,15 @@ class TestXhpr2 {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXhpr2(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.x_vec, args.x_offset, args.x_inc,
-                                buffers.y_vec, args.y_offset, args.y_inc,
-                                buffers.ap_mat, args.ap_offset,
+                                buffers[0].x_vec, args.x_offset, args.x_inc,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
+                                buffers[0].ap_mat, args.ap_offset,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,20 +112,20 @@ class TestXhpr2 {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXhpr2(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  args.n, args.alpha,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc,
                  ap_mat_cpu, args.ap_offset);
-      buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp
index 7a836170..a45dbe8f 100644
--- a/test/routines/level2/xsbmv.hpp
+++ b/test/routines/level2/xsbmv.hpp
@@ -80,14 +80,14 @@ class TestXsbmv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Sbmv(args.layout, args.triangle,
                        args.n, args.kl, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -95,15 +95,15 @@ class TestXsbmv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXsbmv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.kl, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,20 +112,20 @@ class TestXsbmv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXsbmv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  args.n, args.kl, args.alpha,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  x_vec_cpu, args.x_offset, args.x_inc, args.beta,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp
index 352c8cfd..a455f652 100644
--- a/test/routines/level2/xspmv.hpp
+++ b/test/routines/level2/xspmv.hpp
@@ -80,14 +80,14 @@ class TestXspmv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Spmv(args.layout, args.triangle,
                        args.n, args.alpha,
-                       buffers.ap_mat(), args.ap_offset,
-                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].ap_mat(), args.ap_offset,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -95,15 +95,15 @@ class TestXspmv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXspmv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.ap_mat, args.ap_offset,
-                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers[0].ap_mat, args.ap_offset,
+                                buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,20 +112,20 @@ class TestXspmv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXspmv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  args.n, args.alpha,
                  ap_mat_cpu, args.ap_offset,
                  x_vec_cpu, args.x_offset, args.x_inc, args.beta,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp
index 988bcdc2..ab9ab85f 100644
--- a/test/routines/level2/xspr.hpp
+++ b/test/routines/level2/xspr.hpp
@@ -76,13 +76,13 @@ class TestXspr {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Spr(args.layout, args.triangle,
                       args.n, args.alpha,
-                      buffers.x_vec(), args.x_offset, args.x_inc,
-                      buffers.ap_mat(), args.ap_offset,
+                      buffers[0].x_vec(), args.x_offset, args.x_inc,
+                      buffers[0].ap_mat(), args.ap_offset,
                       &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -90,14 +90,14 @@ class TestXspr {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXspr(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
-                               buffers.x_vec, args.x_offset, args.x_inc,
-                               buffers.ap_mat, args.ap_offset,
+                               buffers[0].x_vec, args.x_offset, args.x_inc,
+                               buffers[0].ap_mat, args.ap_offset,
                                1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -106,17 +106,17 @@ class TestXspr {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXspr(convertToCBLAS(args.layout),
                 convertToCBLAS(args.triangle),
                 args.n, args.alpha,
                 x_vec_cpu, args.x_offset, args.x_inc,
                 ap_mat_cpu, args.ap_offset);
-      buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp
index ee517bc1..a73975a5 100644
--- a/test/routines/level2/xspr2.hpp
+++ b/test/routines/level2/xspr2.hpp
@@ -80,14 +80,14 @@ class TestXspr2 {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Spr2(args.layout, args.triangle,
                        args.n, args.alpha,
-                       buffers.x_vec(), args.x_offset, args.x_inc,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
-                       buffers.ap_mat(), args.ap_offset,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].ap_mat(), args.ap_offset,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -95,15 +95,15 @@ class TestXspr2 {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXspr2(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.x_vec, args.x_offset, args.x_inc,
-                                buffers.y_vec, args.y_offset, args.y_inc,
-                                buffers.ap_mat, args.ap_offset,
+                                buffers[0].x_vec, args.x_offset, args.x_inc,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
+                                buffers[0].ap_mat, args.ap_offset,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,20 +112,20 @@ class TestXspr2 {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXspr2(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  args.n, args.alpha,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc,
                  ap_mat_cpu, args.ap_offset);
-      buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp
index 5eecfb74..c93492ed 100644
--- a/test/routines/level2/xsymv.hpp
+++ b/test/routines/level2/xsymv.hpp
@@ -80,14 +80,14 @@ class TestXsymv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Symv(args.layout, args.triangle,
                        args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -95,15 +95,15 @@ class TestXsymv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXsymv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,20 +112,20 @@ class TestXsymv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXsymv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  args.n, args.alpha,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  x_vec_cpu, args.x_offset, args.x_inc, args.beta,
                  y_vec_cpu, args.y_offset, args.y_inc);
-      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp
index ac4ee1ff..ac2c5e98 100644
--- a/test/routines/level2/xsyr.hpp
+++ b/test/routines/level2/xsyr.hpp
@@ -76,13 +76,13 @@ class TestXsyr {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Syr(args.layout, args.triangle,
                       args.n, args.alpha,
-                      buffers.x_vec(), args.x_offset, args.x_inc,
-                      buffers.a_mat(), args.a_offset, args.a_ld,
+                      buffers[0].x_vec(), args.x_offset, args.x_inc,
+                      buffers[0].a_mat(), args.a_offset, args.a_ld,
                       &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -90,14 +90,14 @@ class TestXsyr {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXsyr(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
-                               buffers.x_vec, args.x_offset, args.x_inc,
-                               buffers.a_mat, args.a_offset, args.a_ld,
+                               buffers[0].x_vec, args.x_offset, args.x_inc,
+                               buffers[0].a_mat, args.a_offset, args.a_ld,
                                1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -106,17 +106,17 @@ class TestXsyr {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXsyr(convertToCBLAS(args.layout),
                 convertToCBLAS(args.triangle),
                 args.n, args.alpha,
                 x_vec_cpu, args.x_offset, args.x_inc,
                 a_mat_cpu, args.a_offset, args.a_ld);
-      buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+      buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp
index 43644883..9f8d315b 100644
--- a/test/routines/level2/xsyr2.hpp
+++ b/test/routines/level2/xsyr2.hpp
@@ -80,14 +80,14 @@ class TestXsyr2 {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Syr2(args.layout, args.triangle,
                        args.n, args.alpha,
-                       buffers.x_vec(), args.x_offset, args.x_inc,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].x_vec(), args.x_offset, args.x_inc,
+                       buffers[0].y_vec(), args.y_offset, args.y_inc,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -95,15 +95,15 @@ class TestXsyr2 {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXsyr2(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.x_vec, args.x_offset, args.x_inc,
-                                buffers.y_vec, args.y_offset, args.y_inc,
-                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers[0].x_vec, args.x_offset, args.x_inc,
+                                buffers[0].y_vec, args.y_offset, args.y_inc,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,20 +112,20 @@ class TestXsyr2 {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
-      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
       cblasXsyr2(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  args.n, args.alpha,
                  x_vec_cpu, args.x_offset, args.x_inc,
                  y_vec_cpu, args.y_offset, args.y_inc,
                  a_mat_cpu, args.a_offset, args.a_ld);
-      buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+      buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp
index ab9244af..2d964fda 100644
--- a/test/routines/level2/xtbmv.hpp
+++ b/test/routines/level2/xtbmv.hpp
@@ -75,13 +75,13 @@ class TestXtbmv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
                           args.n, args.kl,
-                          buffers.a_mat(), args.a_offset, args.a_ld,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].a_mat(), args.a_offset, args.a_ld,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -89,7 +89,7 @@ class TestXtbmv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXtbmv<T>(convertToCLBLAS(args.layout),
@@ -97,8 +97,8 @@ class TestXtbmv {
                                    convertToCLBLAS(args.a_transpose),
                                    convertToCLBLAS(args.diagonal),
                                    args.n, args.kl,
-                                   buffers.a_mat, args.a_offset, args.a_ld,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].a_mat, args.a_offset, args.a_ld,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -107,11 +107,11 @@ class TestXtbmv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXtbmv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  convertToCBLAS(args.a_transpose),
@@ -119,7 +119,7 @@ class TestXtbmv {
                  args.n, args.kl,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  x_vec_cpu, args.x_offset, args.x_inc);
-      buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
+      buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xtpmv.hpp b/test/routines/level2/xtpmv.hpp
index 3821e1a4..fcfd86bf 100644
--- a/test/routines/level2/xtpmv.hpp
+++ b/test/routines/level2/xtpmv.hpp
@@ -75,13 +75,13 @@ class TestXtpmv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
                           args.n,
-                          buffers.ap_mat(), args.ap_offset,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].ap_mat(), args.ap_offset,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -89,7 +89,7 @@ class TestXtpmv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXtpmv<T>(convertToCLBLAS(args.layout),
@@ -97,8 +97,8 @@ class TestXtpmv {
                                    convertToCLBLAS(args.a_transpose),
                                    convertToCLBLAS(args.diagonal),
                                    args.n,
-                                   buffers.ap_mat, args.ap_offset,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].ap_mat, args.ap_offset,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -107,11 +107,11 @@ class TestXtpmv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXtpmv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  convertToCBLAS(args.a_transpose),
@@ -119,7 +119,7 @@ class TestXtpmv {
                  args.n,
                  ap_mat_cpu, args.ap_offset,
                  x_vec_cpu, args.x_offset, args.x_inc);
-      buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
+      buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp
index 7211c757..4e209584 100644
--- a/test/routines/level2/xtrmv.hpp
+++ b/test/routines/level2/xtrmv.hpp
@@ -75,13 +75,13 @@ class TestXtrmv {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
                           args.n,
-                          buffers.a_mat(), args.a_offset, args.a_ld,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].a_mat(), args.a_offset, args.a_ld,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -89,7 +89,7 @@ class TestXtrmv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXtrmv<T>(convertToCLBLAS(args.layout),
@@ -97,8 +97,8 @@ class TestXtrmv {
                                    convertToCLBLAS(args.a_transpose),
                                    convertToCLBLAS(args.diagonal),
                                    args.n,
-                                   buffers.a_mat, args.a_offset, args.a_ld,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].a_mat, args.a_offset, args.a_ld,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -107,11 +107,11 @@ class TestXtrmv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXtrmv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  convertToCBLAS(args.a_transpose),
@@ -119,7 +119,7 @@ class TestXtrmv {
                  args.n,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  x_vec_cpu, args.x_offset, args.x_inc);
-      buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
+      buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp
index 78b9672f..090684b1 100644
--- a/test/routines/level2/xtrsv.hpp
+++ b/test/routines/level2/xtrsv.hpp
@@ -90,13 +90,13 @@ class TestXtrsv {
   }
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
                           args.n,
-                          buffers.a_mat(), args.a_offset, args.a_ld,
-                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          buffers[0].a_mat(), args.a_offset, args.a_ld,
+                          buffers[0].x_vec(), args.x_offset, args.x_inc,
                           &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -104,7 +104,7 @@ class TestXtrsv {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXtrsv<T>(convertToCLBLAS(args.layout),
@@ -112,8 +112,8 @@ class TestXtrsv {
                                    convertToCLBLAS(args.a_transpose),
                                    convertToCLBLAS(args.diagonal),
                                    args.n,
-                                   buffers.a_mat, args.a_offset, args.a_ld,
-                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers[0].a_mat, args.a_offset, args.a_ld,
+                                   buffers[0].x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -122,11 +122,11 @@ class TestXtrsv {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
       cblasXtrsv(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  convertToCBLAS(args.a_transpose),
@@ -134,7 +134,7 @@ class TestXtrsv {
                  args.n,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  x_vec_cpu, args.x_offset, args.x_inc);
-      buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
+      buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp
index 1b12fb1c..5b220889 100644
--- a/test/routines/level3/xgemm.hpp
+++ b/test/routines/level3/xgemm.hpp
@@ -88,14 +88,14 @@ class TestXgemm {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
                        args.m, args.n, args.k, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta,
+                       buffers[0].c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -103,16 +103,16 @@ class TestXgemm {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXgemm(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.a_transpose),
                                 convertToCLBLAS(args.b_transpose),
                                 args.m, args.n, args.k, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.b_mat, args.b_offset, args.b_ld, args.beta,
-                                buffers.c_mat, args.c_offset, args.c_ld,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].b_mat, args.b_offset, args.b_ld, args.beta,
+                                buffers[0].c_mat, args.c_offset, args.c_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -121,13 +121,13 @@ class TestXgemm {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
       std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
-      buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
+      buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
       cblasXgemm(convertToCBLAS(args.layout),
                  convertToCBLAS(args.a_transpose),
                  convertToCBLAS(args.b_transpose),
@@ -135,7 +135,7 @@ class TestXgemm {
                  a_mat_cpu, args.a_offset, args.a_ld,
                  b_mat_cpu, args.b_offset, args.b_ld, args.beta,
                  c_mat_cpu, args.c_offset, args.c_ld);
-      buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
+      buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp
index 76550b15..e6e8724f 100644
--- a/test/routines/level3/xhemm.hpp
+++ b/test/routines/level3/xhemm.hpp
@@ -88,14 +88,14 @@ class TestXhemm {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Hemm(args.layout, args.side, args.triangle,
                        args.m, args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta,
+                       buffers[0].c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -103,16 +103,16 @@ class TestXhemm {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXhemm(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.side),
                                 convertToCLBLAS(args.triangle),
                                 args.m, args.n, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.b_mat, args.b_offset, args.b_ld, args.beta,
-                                buffers.c_mat, args.c_offset, args.c_ld,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].b_mat, args.b_offset, args.b_ld, args.beta,
+                                buffers[0].c_mat, args.c_offset, args.c_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -121,13 +121,13 @@ class TestXhemm {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
       std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
-      buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
+      buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
       cblasXhemm(convertToCBLAS(args.layout),
                  convertToCBLAS(args.side),
                  convertToCBLAS(args.triangle),
@@ -135,7 +135,7 @@ class TestXhemm {
                  a_mat_cpu, args.a_offset, args.a_ld,
                  b_mat_cpu, args.b_offset, args.b_ld, args.beta,
                  c_mat_cpu, args.c_offset, args.c_ld);
-      buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
+      buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp
index 5ca3aac6..749eca11 100644
--- a/test/routines/level3/xher2k.hpp
+++ b/test/routines/level3/xher2k.hpp
@@ -86,15 +86,15 @@ class TestXher2k {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto alpha2 = T{args.alpha, args.alpha};
     auto status = Her2k(args.layout, args.triangle, args.a_transpose,
                         args.n, args.k, alpha2,
-                        buffers.a_mat(), args.a_offset, args.a_ld,
-                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                        buffers.c_mat(), args.c_offset, args.c_ld,
+                        buffers[0].a_mat(), args.a_offset, args.a_ld,
+                        buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta,
+                        buffers[0].c_mat(), args.c_offset, args.c_ld,
                         &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -102,7 +102,7 @@ class TestXher2k {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto alpha2 = T{args.alpha, args.alpha};
@@ -110,9 +110,9 @@ class TestXher2k {
                                  convertToCLBLAS(args.triangle),
                                  convertToCLBLAS(args.a_transpose),
                                  args.n, args.k, alpha2,
-                                 buffers.a_mat, args.a_offset, args.a_ld,
-                                 buffers.b_mat, args.b_offset, args.b_ld, args.beta,
-                                 buffers.c_mat, args.c_offset, args.c_ld,
+                                 buffers[0].a_mat, args.a_offset, args.a_ld,
+                                 buffers[0].b_mat, args.b_offset, args.b_ld, args.beta,
+                                 buffers[0].c_mat, args.c_offset, args.c_ld,
                                  1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -121,13 +121,13 @@ class TestXher2k {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
       std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
-      buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
+      buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
       auto alpha2 = T{args.alpha, args.alpha};
       cblasXher2k(convertToCBLAS(args.layout),
                   convertToCBLAS(args.triangle),
@@ -136,7 +136,7 @@ class TestXher2k {
                   a_mat_cpu, args.a_offset, args.a_ld,
                   b_mat_cpu, args.b_offset, args.b_ld, args.beta,
                   c_mat_cpu, args.c_offset, args.c_ld);
-      buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
+      buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp
index e93d887a..e9193847 100644
--- a/test/routines/level3/xherk.hpp
+++ b/test/routines/level3/xherk.hpp
@@ -79,13 +79,13 @@ class TestXherk {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Herk(args.layout, args.triangle, args.a_transpose,
                        args.n, args.k, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
-                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld, args.beta,
+                       buffers[0].c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -93,15 +93,15 @@ class TestXherk {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXherk(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 convertToCLBLAS(args.a_transpose),
                                 args.n, args.k, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld, args.beta,
-                                buffers.c_mat, args.c_offset, args.c_ld,
+                                buffers[0].a_mat, args.a_offset, args.a_ld, args.beta,
+                                buffers[0].c_mat, args.c_offset, args.c_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -110,18 +110,18 @@ class TestXherk {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
       cblasXherk(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  convertToCBLAS(args.a_transpose),
                  args.n, args.k, args.alpha,
                  a_mat_cpu, args.a_offset, args.a_ld, args.beta,
                  c_mat_cpu, args.c_offset, args.c_ld);
-      buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
+      buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp
index 9d127e26..bcd74fda 100644
--- a/test/routines/level3/xsymm.hpp
+++ b/test/routines/level3/xsymm.hpp
@@ -88,14 +88,14 @@ class TestXsymm {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Symm(args.layout, args.side, args.triangle,
                        args.m, args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta,
+                       buffers[0].c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -103,16 +103,16 @@ class TestXsymm {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXsymm(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.side),
                                 convertToCLBLAS(args.triangle),
                                 args.m, args.n, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.b_mat, args.b_offset, args.b_ld, args.beta,
-                                buffers.c_mat, args.c_offset, args.c_ld,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].b_mat, args.b_offset, args.b_ld, args.beta,
+                                buffers[0].c_mat, args.c_offset, args.c_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -121,13 +121,13 @@ class TestXsymm {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
       std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
-      buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
+      buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
       cblasXsymm(convertToCBLAS(args.layout),
                  convertToCBLAS(args.side),
                  convertToCBLAS(args.triangle),
@@ -135,7 +135,7 @@ class TestXsymm {
                  a_mat_cpu, args.a_offset, args.a_ld,
                  b_mat_cpu, args.b_offset, args.b_ld, args.beta,
                  c_mat_cpu, args.c_offset, args.c_ld);
-      buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
+      buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp
index d1bdac56..c722e0cf 100644
--- a/test/routines/level3/xsyr2k.hpp
+++ b/test/routines/level3/xsyr2k.hpp
@@ -86,14 +86,14 @@ class TestXsyr2k {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
                         args.n, args.k, args.alpha,
-                        buffers.a_mat(), args.a_offset, args.a_ld,
-                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                        buffers.c_mat(), args.c_offset, args.c_ld,
+                        buffers[0].a_mat(), args.a_offset, args.a_ld,
+                        buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta,
+                        buffers[0].c_mat(), args.c_offset, args.c_ld,
                         &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -101,16 +101,16 @@ class TestXsyr2k {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXsyr2k(convertToCLBLAS(args.layout),
                                  convertToCLBLAS(args.triangle),
                                  convertToCLBLAS(args.a_transpose),
                                  args.n, args.k, args.alpha,
-                                 buffers.a_mat, args.a_offset, args.a_ld,
-                                 buffers.b_mat, args.b_offset, args.b_ld, args.beta,
-                                 buffers.c_mat, args.c_offset, args.c_ld,
+                                 buffers[0].a_mat, args.a_offset, args.a_ld,
+                                 buffers[0].b_mat, args.b_offset, args.b_ld, args.beta,
+                                 buffers[0].c_mat, args.c_offset, args.c_ld,
                                  1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -119,13 +119,13 @@ class TestXsyr2k {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
       std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
-      buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
+      buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
       cblasXsyr2k(convertToCBLAS(args.layout),
                   convertToCBLAS(args.triangle),
                   convertToCBLAS(args.a_transpose),
@@ -133,7 +133,7 @@ class TestXsyr2k {
                   a_mat_cpu, args.a_offset, args.a_ld,
                   b_mat_cpu, args.b_offset, args.b_ld, args.beta,
                   c_mat_cpu, args.c_offset, args.c_ld);
-      buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
+      buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp
index 1330924e..7d5c2039 100644
--- a/test/routines/level3/xsyrk.hpp
+++ b/test/routines/level3/xsyrk.hpp
@@ -79,13 +79,13 @@ class TestXsyrk {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Syrk(args.layout, args.triangle, args.a_transpose,
                        args.n, args.k, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
-                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld, args.beta,
+                       buffers[0].c_mat(), args.c_offset, args.c_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -93,15 +93,15 @@ class TestXsyrk {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXsyrk(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 convertToCLBLAS(args.a_transpose),
                                 args.n, args.k, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld, args.beta,
-                                buffers.c_mat, args.c_offset, args.c_ld,
+                                buffers[0].a_mat, args.a_offset, args.a_ld, args.beta,
+                                buffers[0].c_mat, args.c_offset, args.c_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -110,18 +110,18 @@ class TestXsyrk {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
       cblasXsyrk(convertToCBLAS(args.layout),
                  convertToCBLAS(args.triangle),
                  convertToCBLAS(args.a_transpose),
                  args.n, args.k, args.alpha,
                  a_mat_cpu, args.a_offset, args.a_ld, args.beta,
                  c_mat_cpu, args.c_offset, args.c_ld);
-      buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
+      buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp
index 7c5bd842..50cca6f8 100644
--- a/test/routines/level3/xtrmm.hpp
+++ b/test/routines/level3/xtrmm.hpp
@@ -79,13 +79,13 @@ class TestXtrmm {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
                        args.m, args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.b_mat(), args.b_offset, args.b_ld,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].b_mat(), args.b_offset, args.b_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -93,7 +93,7 @@ class TestXtrmm {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXtrmm(convertToCLBLAS(args.layout),
@@ -102,8 +102,8 @@ class TestXtrmm {
                                 convertToCLBLAS(args.a_transpose),
                                 convertToCLBLAS(args.diagonal),
                                 args.m, args.n, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.b_mat, args.b_offset, args.b_ld,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].b_mat, args.b_offset, args.b_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -112,11 +112,11 @@ class TestXtrmm {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
       cblasXtrmm(convertToCBLAS(args.layout),
                  convertToCBLAS(args.side),
                  convertToCBLAS(args.triangle),
@@ -125,7 +125,7 @@ class TestXtrmm {
                  args.m, args.n, args.alpha,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  b_mat_cpu, args.b_offset, args.b_ld);
-      buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+      buffers[0].b_mat.Write(queue, args.b_size, b_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp
index a70ef03f..91f91d0b 100644
--- a/test/routines/level3/xtrsm.hpp
+++ b/test/routines/level3/xtrsm.hpp
@@ -91,13 +91,13 @@ class TestXtrsm {
   }
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
                        args.m, args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.b_mat(), args.b_offset, args.b_ld,
+                       buffers[0].a_mat(), args.a_offset, args.a_ld,
+                       buffers[0].b_mat(), args.b_offset, args.b_ld,
                        &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -105,7 +105,7 @@ class TestXtrsm {
 
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
-    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXtrsm(convertToCLBLAS(args.layout),
@@ -114,8 +114,8 @@ class TestXtrsm {
                                 convertToCLBLAS(args.a_transpose),
                                 convertToCLBLAS(args.diagonal),
                                 args.m, args.n, args.alpha,
-                                buffers.a_mat, args.a_offset, args.a_ld,
-                                buffers.b_mat, args.b_offset, args.b_ld,
+                                buffers[0].a_mat, args.a_offset, args.a_ld,
+                                buffers[0].b_mat, args.b_offset, args.b_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
@@ -124,11 +124,11 @@ class TestXtrsm {
 
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
-    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
-      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
-      buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+      buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
       cblasXtrsm(convertToCBLAS(args.layout),
                  convertToCBLAS(args.side),
                  convertToCBLAS(args.triangle),
@@ -137,7 +137,7 @@ class TestXtrsm {
                  args.m, args.n, args.alpha,
                  a_mat_cpu, args.a_offset, args.a_ld,
                  b_mat_cpu, args.b_offset, args.b_ld);
-      buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+      buffers[0].b_mat.Write(queue, args.b_size, b_mat_cpu);
       return StatusCode::kSuccess;
     }
   #endif
diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp
index b470dbf3..2cb1b2ce 100644
--- a/test/routines/levelx/xinvert.hpp
+++ b/test/routines/levelx/xinvert.hpp
@@ -173,14 +173,14 @@ class TestXinvert {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     try {
       auto event = cl_event{};
       auto inverter = Xinvert<T>(queue, &event);
       inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
                                           args.n, args.m,
-                                          buffers.a_mat, args.a_offset, args.a_ld,
-                                          buffers.b_mat);
+                                          buffers[0].a_mat, args.a_offset, args.a_ld,
+                                          buffers[0].b_mat);
       clWaitForEvents(1, &event);
       clReleaseEvent(event);
     } catch (...) { return DispatchException(); }
@@ -189,12 +189,12 @@ class TestXinvert {
 
   // Describes how to run a naive version of the routine (for correctness/performance comparison).
   // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
-  static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
-    return RunReference(args, buffers, queue);
+  static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
+    return RunReference(args, buffers[0], queue);
   }
 
-  static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
-    return RunReference(args, buffers, queue);
+  static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
+    return RunReference(args, buffers[0], queue);
   }
 
   // Describes how to download the results of the computation (more importantly: which buffer)
diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp
index d1064d0c..69f0b2b6 100644
--- a/test/routines/levelx/xomatcopy.hpp
+++ b/test/routines/levelx/xomatcopy.hpp
@@ -133,13 +133,13 @@ class TestXomatcopy {
                           std::vector<T>&, std::vector<T>&) {} // N/A for this routine
 
   // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Omatcopy<T>(args.layout, args.a_transpose,
                               args.m, args.n, args.alpha,
-                              buffers.a_mat(), args.a_offset, args.a_ld,
-                              buffers.b_mat(), args.b_offset, args.b_ld,
+                              buffers[0].a_mat(), args.a_offset, args.a_ld,
+                              buffers[0].b_mat(), args.b_offset, args.b_ld,
                               &queue_plain, &event);
     if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
     return status;
@@ -147,12 +147,12 @@ class TestXomatcopy {
 
   // Describes how to run a naive version of the routine (for correctness/performance comparison).
   // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
-  static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
-    return RunReference(args, buffers, queue);
+  static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
+    return RunReference(args, buffers[0], queue);
   }
 
-  static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
-    return RunReference(args, buffers, queue);
+  static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
+    return RunReference(args, buffers[0], queue);
   }
 
   // Describes how to download the results of the computation (more importantly: which buffer)