9 files changed, 359 insertions, 40 deletions
diff --git a/src/clblast.cpp b/src/clblast.cpp
index a63d766c..d3db8edf 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -71,6 +71,7 @@
 
 // Level-x includes (non-BLAS)
 #include "routines/levelx/xomatcopy.hpp"
+#include "routines/levelx/xaxpybatched.hpp"
 
 namespace clblast {
 
@@ -2172,6 +2173,64 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
                                               const cl_mem, const size_t, const size_t,
                                               cl_mem, const size_t, const size_t,
                                               cl_command_queue*, cl_event*);
+
+// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
+template <typename T>
+StatusCode AxpyBatched(const size_t n,
+                       const T *alphas,
+                       const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                       cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                       const size_t batch_count,
+                       cl_command_queue* queue, cl_event* event) {
+  try {
+    auto queue_cpp = Queue(*queue);
+    auto routine = XaxpyBatched<T>(queue_cpp, event);
+    auto alphas_cpp = std::vector<T>();
+    auto x_offsets_cpp = std::vector<size_t>();
+    auto y_offsets_cpp = std::vector<size_t>();
+    for (auto batch = size_t{0}; batch < batch_count; ++batch) {
+      alphas_cpp.push_back(alphas[batch]);
+      x_offsets_cpp.push_back(x_offsets[batch]);
+      y_offsets_cpp.push_back(y_offsets[batch]);
+    }
+    routine.DoAxpyBatched(n,
+                          alphas_cpp,
+                          Buffer<T>(x_buffer), x_offsets_cpp, x_inc,
+                          Buffer<T>(y_buffer), y_offsets_cpp, y_inc,
+                          batch_count);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API AxpyBatched<float>(const size_t,
+                                                  const float*,
+                                                  const cl_mem, const size_t*, const size_t,
+                                                  cl_mem, const size_t*, const size_t,
+                                                  const size_t,
+                                                  cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API AxpyBatched<double>(const size_t,
+                                                   const double*,
+                                                   const cl_mem, const size_t*, const size_t,
+                                                   cl_mem, const size_t*, const size_t,
+                                                   const size_t,
+                                                   cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t,
+                                                   const float2*,
+                                                   const cl_mem, const size_t*, const size_t,
+                                                   cl_mem, const size_t*, const size_t,
+                                                   const size_t,
+                                                   cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t,
+                                                    const double2*,
+                                                    const cl_mem, const size_t*, const size_t,
+                                                    cl_mem, const size_t*, const size_t,
+                                                    const size_t,
+                                                    cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API AxpyBatched<half>(const size_t,
+                                                 const half*,
+                                                 const cl_mem, const size_t*, const size_t,
+                                                 cl_mem, const size_t*, const size_t,
+                                                 const size_t,
+                                                 cl_command_queue*, cl_event*);
 // =================================================================================================
 
 // Clears the cache of stored binaries
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index 6018bcfa..b09f8c54 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -3447,6 +3447,113 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran
   } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
 }
 
+// AXPY
+CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
+                                      const float *alphas,
+                                      const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                                      cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event) {
+  auto alphas_cpp = std::vector<float>();
+  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
+    alphas_cpp.push_back(alphas[batch]);
+  }
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::AxpyBatched(n,
+                           alphas_cpp.data(),
+                           x_buffer, x_offsets, x_inc,
+                           y_buffer, y_offsets, y_inc,
+                           batch_count,
+                           queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
+                                      const double *alphas,
+                                      const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                                      cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event) {
+  auto alphas_cpp = std::vector<double>();
+  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
+    alphas_cpp.push_back(alphas[batch]);
+  }
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::AxpyBatched(n,
+                           alphas_cpp.data(),
+                           x_buffer, x_offsets, x_inc,
+                           y_buffer, y_offsets, y_inc,
+                           batch_count,
+                           queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
+                                      const cl_float2 *alphas,
+                                      const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                                      cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event) {
+  auto alphas_cpp = std::vector<float2>();
+  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
+    alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]});
+  }
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::AxpyBatched(n,
+                           alphas_cpp.data(),
+                           x_buffer, x_offsets, x_inc,
+                           y_buffer, y_offsets, y_inc,
+                           batch_count,
+                           queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
+                                      const cl_double2 *alphas,
+                                      const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                                      cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event) {
+  auto alphas_cpp = std::vector<double2>();
+  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
+    alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]});
+  }
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::AxpyBatched(n,
+                           alphas_cpp.data(),
+                           x_buffer, x_offsets, x_inc,
+                           y_buffer, y_offsets, y_inc,
+                           batch_count,
+                           queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
+                                      const cl_half *alphas,
+                                      const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                                      cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event) {
+  auto alphas_cpp = std::vector<half>();
+  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
+    alphas_cpp.push_back(alphas[batch]);
+  }
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::AxpyBatched(n,
+                           alphas_cpp.data(),
+                           x_buffer, x_offsets, x_inc,
+                           y_buffer, y_offsets, y_inc,
+                           batch_count,
+                           queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+
 // =================================================================================================
 
 // Clears the cache of stored binaries
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 41af28da..29f81cf8 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -600,9 +600,6 @@ class Buffer {
 
   // Copies from host to device: writing the device buffer a-synchronously
   void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
-    if (access_ == BufferAccess::kReadOnly) {
-      throw LogicError("Buffer: writing to a read-only buffer");
-    }
     if (GetSize() < (offset+size)*sizeof(T)) {
       throw LogicError("Buffer: target device buffer is too small");
     }
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index ece8476e..f44bbce0 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -9,7 +9,7 @@
 //
 // This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
 // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
-// support vector data-types.
+// support vector data-types. The general version has a batched implementation as well.
 //
 // This kernel uses the level-1 BLAS common tuning parameters.
 //
@@ -36,8 +36,6 @@ void Xaxpy(const int n, const real_arg arg_alpha,
   }
 }
 
-// =================================================================================================
-
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
 __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
@@ -57,6 +55,24 @@ void XaxpyFast(const int n, const real_arg arg_alpha,
 
 // =================================================================================================
 
+// Full version of the kernel with offsets and strided accesses: batched version
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
+                  const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc,
+                  __global real* ygm, const __constant int* y_offsets, const int y_inc) {
+  const int batch = get_group_id(1);
+  const real alpha = GetRealArg(arg_alphas[batch]);
+
+  // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+  #pragma unroll
+  for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
+    real xvalue = xgm[id*x_inc + x_offsets[batch]];
+    MultiplyAdd(ygm[id*y_inc + y_offsets[batch]], alpha, xvalue);
+  }
+}
+
+// =================================================================================================
+
 // End of the C++11 raw string literal
 )"
 
diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp
new file mode 100644
index 00000000..6a4269be
--- /dev/null
+++ b/src/routines/levelx/xaxpybatched.cpp
@@ -0,0 +1,95 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the XaxpyBatched class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "routines/levelx/xaxpybatched.hpp"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name):
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
+    #include "../../kernels/level1/level1.opencl"
+    #include "../../kernels/level1/xaxpy.opencl"
+    }) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas,
+                                    const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc,
+                                    const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc,
+                                    const size_t batch_count) {
+
+  // Tests for a valid batch count
+  if ((batch_count < 1) || (alphas.size() != batch_count) ||
+      (x_offsets.size() != batch_count) || (y_offsets.size() != batch_count)) {
+    throw BLASError(StatusCode::kInvalidBatchCount);
+  }
+
+  // Makes sure all dimensions are larger than zero
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
+
+  // Tests the vectors for validity
+  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
+    TestVectorX(n, x_buffer, x_offsets[batch], x_inc);
+    TestVectorY(n, y_buffer, y_offsets[batch], y_inc);
+  }
+
+  // Upload the arguments to the device
+  std::vector<int> x_offsets_int(x_offsets.begin(), x_offsets.end());
+  std::vector<int> y_offsets_int(y_offsets.begin(), y_offsets.end());
+  auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+  auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+  auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count);
+  x_offsets_device.Write(queue_, batch_count, x_offsets_int);
+  y_offsets_device.Write(queue_, batch_count, y_offsets_int);
+  alphas_device.Write(queue_, batch_count, alphas);
+
+  // Retrieves the Xaxpy kernel from the compiled binary
+  auto kernel = Kernel(program_, "XaxpyBatched");
+
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, alphas_device());
+  kernel.SetArgument(2, x_buffer());
+  kernel.SetArgument(3, x_offsets_device());
+  kernel.SetArgument(4, static_cast<int>(x_inc));
+  kernel.SetArgument(5, y_buffer());
+  kernel.SetArgument(6, y_offsets_device());
+  kernel.SetArgument(7, static_cast<int>(y_inc));
+
+  // Launches the kernel
+  auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+  auto global = std::vector<size_t>{n_ceiled/db_["WPT"], batch_count};
+  auto local = std::vector<size_t>{db_["WGS"], 1};
+  RunKernel(kernel, queue_, device_, global, local, event_);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class XaxpyBatched<half>;
+template class XaxpyBatched<float>;
+template class XaxpyBatched<double>;
+template class XaxpyBatched<float2>;
+template class XaxpyBatched<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/levelx/xaxpybatched.hpp b/src/routines/levelx/xaxpybatched.hpp
new file mode 100644
index 00000000..513792ea
--- /dev/null
+++ b/src/routines/levelx/xaxpybatched.hpp
@@ -0,0 +1,43 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the XaxpyBatched routine. This is a non-blas batched version of AXPY.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAXPYBATCHED_H_
+#define CLBLAST_ROUTINES_XAXPYBATCHED_H_
+
+#include <vector>
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class XaxpyBatched: public Routine {
+ public:
+
+  // Constructor
+  XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED");
+
+  // Templated-precision implementation of the routine
+  void DoAxpyBatched(const size_t n, const std::vector<T> &alphas,
+                     const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc,
+                     const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc,
+                     const size_t batch_count);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAXPYBATCHED_H_
+#endif
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 1dd76894..7060fc9f 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -17,6 +17,7 @@
 
 #include <vector>
 #include <string>
+#include <random>
 
 #include <cltune.h>
 
@@ -77,12 +78,14 @@ void Tuner(int argc, char* argv[]) {
   auto b_mat = std::vector<T>(C::GetSizeB(args));
   auto c_mat = std::vector<T>(C::GetSizeC(args));
   auto temp = std::vector<T>(C::GetSizeTemp(args));
-  PopulateVector(x_vec, kSeed);
-  PopulateVector(y_vec, kSeed);
-  PopulateVector(a_mat, kSeed);
-  PopulateVector(b_mat, kSeed);
-  PopulateVector(c_mat, kSeed);
-  PopulateVector(temp, kSeed);
+  std::mt19937 mt(kSeed);
+  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
+  PopulateVector(x_vec, mt, dist);
+  PopulateVector(y_vec, mt, dist);
+  PopulateVector(a_mat, mt, dist);
+  PopulateVector(b_mat, mt, dist);
+  PopulateVector(c_mat, mt, dist);
+  PopulateVector(temp, mt, dist);
 
   // Initializes the tuner for the chosen device
   cltune::Tuner tuner(args.platform_id, args.device_id);
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index d68cc1a6..3d091b64 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -67,8 +67,8 @@ template <> double2 Constant(const double val) { return {val, 0.0}; }
 template <typename T> T SmallConstant() { return static_cast<T>(1e-4); }
 template float SmallConstant<float>();
 template double SmallConstant<double>();
-template <> half SmallConstant() { return FloatToHalf(1e-4); }
-template <> float2 SmallConstant() { return {1e-4, 0.0f}; }
+template <> half SmallConstant() { return FloatToHalf(1e-4f); }
+template <> float2 SmallConstant() { return {1e-4f, 0.0f}; }
 template <> double2 SmallConstant() { return {1e-4, 0.0}; }
 
 // Returns the absolute value of a scalar (modulus in case of a complex number)
@@ -326,42 +326,29 @@ unsigned int GetRandomSeed() {
 
 // Create a random number generator and populates a vector with samples from a random distribution
 template <typename T>
-void PopulateVector(std::vector<T> &vector, const unsigned int seed) {
-  auto lower_limit = static_cast<T>(kTestDataLowerLimit);
-  auto upper_limit = static_cast<T>(kTestDataUpperLimit);
-  std::mt19937 mt(seed);
-  std::uniform_real_distribution<T> dist(lower_limit, upper_limit);
-  for (auto &element: vector) { element = dist(mt); }
+void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) {
+  for (auto &element: vector) { element = static_cast<T>(dist(mt)); }
 }
-template void PopulateVector<float>(std::vector<float>&, const unsigned int);
-template void PopulateVector<double>(std::vector<double>&, const unsigned int);
+template void PopulateVector<float>(std::vector<float>&, std::mt19937&, std::uniform_real_distribution<double>&);
+template void PopulateVector<double>(std::vector<double>&, std::mt19937&, std::uniform_real_distribution<double>&);
 
 // Specialized versions of the above for complex data-types
 template <>
-void PopulateVector(std::vector<float2> &vector, const unsigned int seed) {
-  auto lower_limit = static_cast<float>(kTestDataLowerLimit);
-  auto upper_limit = static_cast<float>(kTestDataUpperLimit);
-  std::mt19937 mt(seed);
-  std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
-  for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
+void PopulateVector(std::vector<float2> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) {
+  for (auto &element: vector) {
+    element.real(static_cast<float>(dist(mt)));
+    element.imag(static_cast<float>(dist(mt)));
+  }
 }
 template <>
-void PopulateVector(std::vector<double2> &vector, const unsigned int seed) {
-  auto lower_limit = static_cast<double>(kTestDataLowerLimit);
-  auto upper_limit = static_cast<double>(kTestDataUpperLimit);
-  std::mt19937 mt(seed);
-  std::uniform_real_distribution<double> dist(lower_limit, upper_limit);
+void PopulateVector(std::vector<double2> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) {
   for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
 }
 
 // Specialized versions of the above for half-precision
 template <>
-void PopulateVector(std::vector<half> &vector, const unsigned int seed) {
-  const auto lower_limit = static_cast<float>(kTestDataLowerLimit);
-  const auto upper_limit = static_cast<float>(kTestDataUpperLimit);
-  std::mt19937 mt(seed);
-  std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
-  for (auto &element: vector) { element = FloatToHalf(dist(mt)); }
+void PopulateVector(std::vector<half> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) {
+  for (auto &element: vector) { element = FloatToHalf(static_cast<float>(dist(mt))); }
 }
 
 // =================================================================================================
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 3c9be6a2..b3db8c22 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -20,6 +20,7 @@
 #include <string>
 #include <functional>
 #include <complex>
+#include <random>
 
 #include "clpp11.hpp"
 #include "clblast.h"
@@ -72,6 +73,7 @@ constexpr auto kArgAsumOffset = "offasum";
 constexpr auto kArgImaxOffset = "offimax";
 constexpr auto kArgAlpha = "alpha";
 constexpr auto kArgBeta = "beta";
+constexpr auto kArgBatchCount = "batch_num";
 
 // The tuner-specific arguments in string form
 constexpr auto kArgFraction = "fraction";
@@ -155,6 +157,16 @@ struct Arguments {
   size_t imax_offset = 0;
   T alpha = ConstantOne<T>();
   T beta = ConstantOne<T>();
+  // Batch-specific arguments
+  size_t batch_count = 1;
+  std::vector<size_t> x_offsets = {0};
+  std::vector<size_t> y_offsets = {0};
+  std::vector<size_t> a_offsets = {0};
+  std::vector<size_t> b_offsets = {0};
+  std::vector<size_t> c_offsets = {0};
+  std::vector<T> alphas = {ConstantOne<T>()};
+  std::vector<T> betas = {ConstantOne<T>()};
+  // Sizes
   size_t x_size = 1;
   size_t y_size = 1;
   size_t a_size = 1;
@@ -234,7 +246,7 @@ constexpr auto kTestDataUpperLimit = 2.0;
 
 // Populates a vector with random data
 template <typename T>
-void PopulateVector(std::vector<T> &vector, const unsigned int seed);
+void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist);
 
 // =================================================================================================