9 files changed, 296 insertions, 34 deletions
diff --git a/test/correctness/misc/override_parameters.cpp b/test/correctness/misc/override_parameters.cpp
index a4cecf0d..e6eebef7 100644
--- a/test/correctness/misc/override_parameters.cpp
+++ b/test/correctness/misc/override_parameters.cpp
@@ -11,11 +11,14 @@
 //
 // =================================================================================================
 
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <random>
+
 #include "utilities/utilities.hpp"
 #include "test/routines/level3/xgemm.hpp"
 
-#include <unordered_map>
-
 namespace clblast {
 // =================================================================================================
 
@@ -71,9 +74,11 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st
   auto host_a = std::vector<T>(args.m * args.k);
   auto host_b = std::vector<T>(args.n * args.k);
   auto host_c = std::vector<T>(args.m * args.n);
-  PopulateVector(host_a, kSeed);
-  PopulateVector(host_b, kSeed);
-  PopulateVector(host_c, kSeed);
+  std::mt19937 mt(kSeed);
+  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
+  PopulateVector(host_a, mt, dist);
+  PopulateVector(host_b, mt, dist);
+  PopulateVector(host_c, mt, dist);
 
   // Copy the matrices to the device
   auto device_a = Buffer<T>(context, host_a.size());
diff --git a/test/correctness/routines/levelx/xaxpybatched.cpp b/test/correctness/routines/levelx/xaxpybatched.cpp
new file mode 100644
index 00000000..a106440f
--- /dev/null
+++ b/test/correctness/routines/levelx/xaxpybatched.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/levelx/xaxpybatched.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): tests every precision, exits non-zero on failure
+int main(int argc, char *argv[]) {
+  size_t failures = 0;
+  failures += clblast::RunTests<clblast::TestXaxpyBatched<float>, float, float>(argc, argv, false, "SAXPYBATCHED");
+  failures += clblast::RunTests<clblast::TestXaxpyBatched<double>, double, double>(argc, argv, true, "DAXPYBATCHED");
+  failures += clblast::RunTests<clblast::TestXaxpyBatched<float2>, float2, float2>(argc, argv, true, "CAXPYBATCHED");
+  failures += clblast::RunTests<clblast::TestXaxpyBatched<double2>, double2, double2>(argc, argv, true, "ZAXPYBATCHED");
+  failures += clblast::RunTests<clblast::TestXaxpyBatched<half>, half, half>(argc, argv, true, "HAXPYBATCHED");
+  return (failures > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index d959ce18..56376d0b 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -13,7 +13,9 @@
 
 #include <algorithm>
 #include <iostream>
+#include <random>
 
+#include "utilities/utilities.hpp"
 #include "test/correctness/testblas.hpp"
 
 namespace clblast {
@@ -25,6 +27,7 @@ template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kIncr
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixDims = { 7, 64 };
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixVectorDims = { 61, 256 };
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kBandSizes = { 4, 19 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kBatchCounts = { 1, 3 };
 
 // Test settings for the invalid tests
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kInvalidIncrements = { 0, 1 };
@@ -79,22 +82,25 @@ TestBlas<T,U>::TestBlas(const std::vector<std::string> &arguments, const bool si
   const auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
   const auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
   const auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
+  const auto max_batch_count = *std::max_element(kBatchCounts.begin(), kBatchCounts.end());
 
   // Creates test input data
-  x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
-  y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
-  a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
-  scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset);
-  PopulateVector(x_source_, kSeed);
-  PopulateVector(y_source_, kSeed);
-  PopulateVector(a_source_, kSeed);
-  PopulateVector(b_source_, kSeed);
-  PopulateVector(c_source_, kSeed);
-  PopulateVector(ap_source_, kSeed);
-  PopulateVector(scalar_source_, kSeed);
+  x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset);
+  y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset);
+  a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+  b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+  c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+  ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
+  scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset);
+  std::mt19937 mt(kSeed);
+  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
+  PopulateVector(x_source_, mt, dist);
+  PopulateVector(y_source_, mt, dist);
+  PopulateVector(a_source_, mt, dist);
+  PopulateVector(b_source_, mt, dist);
+  PopulateVector(c_source_, mt, dist);
+  PopulateVector(ap_source_, mt, dist);
+  PopulateVector(scalar_source_, mt, dist);
 }
 
 // ===============================================================================================
@@ -190,15 +196,15 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     auto result2 = get_result_(args, buffers2, queue_);
 
     // Computes the L2 error
-    const auto kErrorMarginL2 = getL2ErrorMargin<T>();
     auto l2error = 0.0;
+    const auto kErrorMarginL2 = getL2ErrorMargin<T>();
     for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
       for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
         auto index = get_index_(args, id1, id2);
         l2error += SquaredDifference(result1[index], result2[index]);
       }
     }
-    l2error /= (get_id1_(args) * get_id2_(args));
+    l2error /= static_cast<double>(get_id1_(args) * get_id2_(args));
 
     // Checks for differences in the output
     auto errors = size_t{0};
@@ -219,8 +225,10 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
         }
       }
     }
+
+    // Report the results
     if (verbose_ && errors > 0) {
-      fprintf(stdout, "\n   Combined L2 error: %.2e\n   ", l2error);
+      fprintf(stdout, "\n   Combined average L2 error: %.2e\n   ", l2error);
     }
 
     // Tests the error count (should be zero)
diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp
index ee795aad..42e8aef7 100644
--- a/test/correctness/testblas.hpp
+++ b/test/correctness/testblas.hpp
@@ -56,6 +56,7 @@ class TestBlas: public Tester<T,U> {
   static const std::vector<size_t> kMatrixDims;
   static const std::vector<size_t> kMatrixVectorDims;
   static const std::vector<size_t> kBandSizes;
+  static const std::vector<size_t> kBatchCounts;
   const std::vector<size_t> kOffsets;
   const std::vector<U> kAlphaValues;
   const std::vector<U> kBetaValues;
@@ -183,6 +184,7 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
   auto imax_offsets = std::vector<size_t>{args.imax_offset};
   auto alphas = std::vector<U>{args.alpha};
   auto betas = std::vector<U>{args.beta};
+  auto batch_counts = std::vector<size_t>{args.batch_count};
   auto x_sizes = std::vector<size_t>{args.x_size};
   auto y_sizes = std::vector<size_t>{args.y_size};
   auto a_sizes = std::vector<size_t>{args.a_size};
@@ -226,6 +228,7 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
     if (option == kArgImaxOffset) { imax_offsets = tester.kOffsets; }
     if (option == kArgAlpha) { alphas = tester.kAlphaValues; }
     if (option == kArgBeta) { betas = tester.kBetaValues; }
+    if (option == kArgBatchCount) { batch_counts = tester.kBatchCounts; }
 
     if (option == kArgXOffset) { x_sizes = tester.kVecSizes; }
     if (option == kArgYOffset) { y_sizes = tester.kVecSizes; }
@@ -268,8 +271,10 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
                                                     for (auto &imax_offset: imax_offsets) { r_args.imax_offset = imax_offset;
                                                       for (auto &alpha: alphas) { r_args.alpha = alpha;
                                                         for (auto &beta: betas) { r_args.beta = beta;
-                                                          C::SetSizes(r_args);
-                                                          regular_test_vector.push_back(r_args);
+                                                          for (auto &batch_count: batch_counts) { r_args.batch_count = batch_count;
+                                                            C::SetSizes(r_args);
+                                                            regular_test_vector.push_back(r_args);
+                                                          }
                                                         }
                                                       }
                                                     }
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index cbfc5bb2..40784fdb 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -367,6 +367,7 @@ std::string Tester<T,U>::GetOptionsString(const Arguments<U> &args) {
     if (o == kArgDotOffset){ result += kArgDotOffset + equals + ToString(args.dot_offset) + " "; }
     if (o == kArgAlpha)    { result += kArgAlpha + equals + ToString(args.alpha) + " "; }
     if (o == kArgBeta)     { result += kArgBeta + equals + ToString(args.beta) + " "; }
+    if (o == kArgBatchCount){result += kArgBatchCount + equals + ToString(args.batch_count) + " "; }
   }
   return result;
 }
diff --git a/test/performance/client.cpp b/test/performance/client.cpp
index 2c45b35e..bd48b047 100644
--- a/test/performance/client.cpp
+++ b/test/performance/client.cpp
@@ -11,13 +11,15 @@
 //
 // =================================================================================================
 
-#include "test/performance/client.hpp"
-
 #include <string>
 #include <vector>
 #include <utility>
 #include <algorithm>
 #include <chrono>
+#include <random>
+
+#include "utilities/utilities.hpp"
+#include "test/performance/client.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -89,6 +91,9 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t le
     if (o == kArgAsumOffset)  { args.asum_offset = GetArgument(command_line_args, help, kArgAsumOffset, size_t{0}); }
     if (o == kArgImaxOffset)  { args.imax_offset = GetArgument(command_line_args, help, kArgImaxOffset, size_t{0}); }
 
+    // Batch arguments
+    if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, size_t{1}); }
+
     // Scalar values 
     if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar<U>()); }
     if (o == kArgBeta)  { args.beta  = GetArgument(command_line_args, help, kArgBeta, GetScalar<U>()); }
@@ -179,13 +184,15 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
     std::vector<T> c_source(args.c_size);
     std::vector<T> ap_source(args.ap_size);
     std::vector<T> scalar_source(args.scalar_size);
-    PopulateVector(x_source, kSeed);
-    PopulateVector(y_source, kSeed);
-    PopulateVector(a_source, kSeed);
-    PopulateVector(b_source, kSeed);
-    PopulateVector(c_source, kSeed);
-    PopulateVector(ap_source, kSeed);
-    PopulateVector(scalar_source, kSeed);
+    std::mt19937 mt(kSeed);
+    std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
+    PopulateVector(x_source, mt, dist);
+    PopulateVector(y_source, mt, dist);
+    PopulateVector(a_source, mt, dist);
+    PopulateVector(b_source, mt, dist);
+    PopulateVector(c_source, mt, dist);
+    PopulateVector(ap_source, mt, dist);
+    PopulateVector(scalar_source, mt, dist);
 
     // Creates the matrices on the device
     auto x_vec = Buffer<T>(context, args.x_size);
@@ -335,6 +342,7 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args,
     else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); }
     else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); }
     else if (o == kArgImaxOffset){integers.push_back(args.imax_offset); }
+    else if (o == kArgBatchCount){integers.push_back(args.batch_count); }
   }
   auto strings = std::vector<std::string>{};
   for (auto &o: options_) {
diff --git a/test/performance/routines/levelx/xaxpybatched.cpp b/test/performance/routines/levelx/xaxpybatched.cpp
new file mode 100644
index 00000000..6d3bcb51
--- /dev/null
+++ b/test/performance/routines/levelx/xaxpybatched.cpp
@@ -0,0 +1,37 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/performance/client.hpp"
+#include "test/routines/levelx/xaxpybatched.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): runs the client for the precision requested
+int main(int argc, char *argv[]) {
+  const auto cli_args = clblast::RetrieveCommandLineArguments(argc, argv);
+  const auto precision = clblast::GetPrecision(cli_args, clblast::Precision::kSingle);
+  if (precision == clblast::Precision::kHalf) {
+    clblast::RunClient<clblast::TestXaxpyBatched<half>, half, half>(argc, argv);
+  } else if (precision == clblast::Precision::kSingle) {
+    clblast::RunClient<clblast::TestXaxpyBatched<float>, float, float>(argc, argv);
+  } else if (precision == clblast::Precision::kDouble) {
+    clblast::RunClient<clblast::TestXaxpyBatched<double>, double, double>(argc, argv);
+  } else if (precision == clblast::Precision::kComplexSingle) {
+    clblast::RunClient<clblast::TestXaxpyBatched<float2>, float2, float2>(argc, argv);
+  } else if (precision == clblast::Precision::kComplexDouble) {
+    clblast::RunClient<clblast::TestXaxpyBatched<double2>, double2, double2>(argc, argv);
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp
new file mode 100644
index 00000000..ee15ff92
--- /dev/null
+++ b/test/routines/levelx/xaxpybatched.hpp
@@ -0,0 +1,168 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the XaxpyBatched routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_
+#define CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_
+
+#include <vector>
+#include <string>
+
+#include "utilities/utilities.hpp"
+
+#ifdef CLBLAST_REF_CLBLAS
+  #include "test/wrapper_clblas.hpp"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+  #include "test/wrapper_cblas.hpp"
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXaxpyBatched {
+ public:
+
+  // Although it is a non-BLAS routine, it can still be tested against level-1 routines in a loop
+  static size_t BLASLevel() { return 1; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgXInc, kArgYInc,
+            kArgBatchCount, kArgAlpha};
+  }
+
+  // Helpers: the number of elements a single batch occupies in the x and y buffers
+  static size_t PerBatchSizeX(const Arguments<T> &args) { return args.n * args.x_inc; }
+  static size_t PerBatchSizeY(const Arguments<T> &args) { return args.n * args.y_inc; }
+
+  // Describes how to obtain the sizes of the buffers (all batches plus the leading offset)
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return PerBatchSizeX(args) * args.batch_count + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    return PerBatchSizeY(args) * args.batch_count + args.y_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+
+    // Also sets the batch-related variables: consecutive offsets and a per-batch alpha value
+    args.x_offsets = std::vector<size_t>(args.batch_count);
+    args.y_offsets = std::vector<size_t>(args.batch_count);
+    args.alphas = std::vector<T>(args.batch_count);
+    for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+      args.x_offsets[batch] = batch * PerBatchSizeX(args) + args.x_offset;
+      args.y_offsets[batch] = batch * PerBatchSizeY(args) + args.y_offset;
+      args.alphas[batch] = args.alpha + Constant<T>(batch);
+    }
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to prepare the input data
+  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
+                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
+                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = AxpyBatched(args.n, args.alphas.data(),
+                              buffers.x_vec(), args.x_offsets.data(), args.x_inc,
+                              buffers.y_vec(), args.y_offsets.data(), args.y_inc,
+                              args.batch_count,
+                              &queue_plain, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  #ifdef CLBLAST_REF_CLBLAS
+    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      auto queue_plain = queue();
+      for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+        auto event = cl_event{};
+        const auto status = static_cast<StatusCode>(
+            clblasXaxpy(args.n, args.alphas[batch],
+                        buffers.x_vec, args.x_offsets[batch], args.x_inc,
+                        buffers.y_vec, args.y_offsets[batch], args.y_inc,
+                        1, &queue_plain, 0, nullptr, &event));
+        // Only a successful enqueue yields a valid event: wait for it, then release it (no leak)
+        if (status != StatusCode::kSuccess) { return status; }
+        clWaitForEvents(1, &event); clReleaseEvent(event);
+      }
+      return StatusCode::kSuccess;
+    }
+  #endif
+
+  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+  #ifdef CLBLAST_REF_CBLAS
+    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+      std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
+      for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+        cblasXaxpy(args.n, args.alphas[batch],
+                   x_vec_cpu, args.x_offsets[batch], args.x_inc,
+                   y_vec_cpu, args.y_offsets[batch], args.y_inc);
+      }
+      buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
+      return StatusCode::kSuccess;
+    }
+  #endif
+
+  // Describes how to download the results of the computation
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.Read(queue, args.y_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer (id1: element, id2: batch)
+  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.batch_count; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (id1 * args.y_inc) + args.y_offsets[id2];
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return args.batch_count * (2 * args.n);
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return args.batch_count * (3 * args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_
+#endif
diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp
index 05bea9aa..b470dbf3 100644
--- a/test/routines/levelx/xinvert.hpp
+++ b/test/routines/levelx/xinvert.hpp
@@ -19,7 +19,7 @@
 #include <vector>
 #include <string>
 
-#include "routines/levelx/xinvert.hpp"
+#include "utilities/utilities.hpp"
 
 namespace clblast {
 // =================================================================================================