11 files changed, 342 insertions, 69 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf905bc8..ef6156dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -159,7 +159,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
                     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
-set(LEVELX_ROUTINES xomatcopy)
+set(LEVELX_ROUTINES xomatcopy xaxpybatched)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
 set(PRECISIONS 32 64 3232 6464 16)
 
diff --git a/doc/clblast.md b/doc/clblast.md
index eda5c07f..c919169a 100644
--- a/doc/clblast.md
+++ b/doc/clblast.md
@@ -2913,8 +2913,8 @@ C++ API:
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
                        const T *alphas,
-                       const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                       cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                       const cl_mem *x_buffers, const size_t x_inc,
+                       cl_mem *y_buffers, const size_t y_inc,
                        const size_t batch_count,
                        cl_command_queue* queue, cl_event* event)
 ```
@@ -2923,32 +2923,32 @@ C API:
 ```
 CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
                                       const float *alphas,
-                                      const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                      cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                      const cl_mem *x_buffers, const size_t x_inc,
+                                      cl_mem *y_buffers, const size_t y_inc,
                                       const size_t batch_count,
                                       cl_command_queue* queue, cl_event* event)
 CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
                                       const double *alphas,
-                                      const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                      cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                      const cl_mem *x_buffers, const size_t x_inc,
+                                      cl_mem *y_buffers, const size_t y_inc,
                                       const size_t batch_count,
                                       cl_command_queue* queue, cl_event* event)
 CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
                                       const cl_float2 *alphas,
-                                      const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                      cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                      const cl_mem *x_buffers, const size_t x_inc,
+                                      cl_mem *y_buffers, const size_t y_inc,
                                       const size_t batch_count,
                                       cl_command_queue* queue, cl_event* event)
 CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
                                       const cl_double2 *alphas,
-                                      const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                      cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                      const cl_mem *x_buffers, const size_t x_inc,
+                                      cl_mem *y_buffers, const size_t y_inc,
                                       const size_t batch_count,
                                       cl_command_queue* queue, cl_event* event)
 CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
                                       const cl_half *alphas,
-                                      const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                      cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                      const cl_mem *x_buffers, const size_t x_inc,
+                                      cl_mem *y_buffers, const size_t y_inc,
                                       const size_t batch_count,
                                       cl_command_queue* queue, cl_event* event)
 ```
@@ -2958,10 +2958,8 @@ Arguments to AXPYBATCHED:
 * `const size_t n`: Integer size argument. This value must be positive.
 * `const T *alphas`: Input scalar constants.
 * `const cl_mem *x_buffers`: OpenCL buffers to store the input x vectors.
-* `const size_t x_offset`: The offset in elements from the start of the input x vectors.
 * `const size_t x_inc`: Stride/increment of the input x vectors. This value must be greater than 0.
 * `cl_mem *y_buffers`: OpenCL buffers to store the output y vectors.
-* `const size_t y_offset`: The offset in elements from the start of the output y vectors.
 * `const size_t y_inc`: Stride/increment of the output y vectors. This value must be greater than 0.
 * `const size_t batch_count`: Number of batches. This value must be positive.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
diff --git a/include/clblast.h b/include/clblast.h
index aeea4e52..f3f73893 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -97,6 +97,7 @@ enum class StatusCode {
   kInsufficientMemoryY       = -1007, // Vector Y's OpenCL buffer is too small
 
   // Custom additional status codes for CLBlast
+  kInvalidBatchCount         = -2049, // The batch count needs to be positive
   kInvalidOverrideKernel     = -2048, // Trying to override parameters for an invalid kernel
   kMissingOverrideParameter  = -2047, // Missing override parameter(s) for the target kernel
   kInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
@@ -613,8 +614,8 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
                        const T *alphas,
-                       const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                       cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                       const cl_mem *x_buffers, const size_t x_inc,
+                       cl_mem *y_buffers, const size_t y_inc,
                        const size_t batch_count,
                        cl_command_queue* queue, cl_event* event = nullptr);
 
diff --git a/include/clblast_c.h b/include/clblast_c.h
index f933ef6c..5c84b5d7 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -96,6 +96,7 @@ typedef enum CLBlastStatusCode_ {
   CLBlastInsufficientMemoryY       = -1007, // Vector Y's OpenCL buffer is too small
 
   // Custom additional status codes for CLBlast
+  CLBlastInvalidBatchCount         = -2049, // The batch count needs to be positive
   CLBlastInvalidOverrideKernel     = -2048, // Trying to override parameters for an invalid kernel
   CLBlastMissingOverrideParameter  = -2047, // Missing override parameter(s) for the target kernel
   CLBlastInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
@@ -1330,32 +1331,32 @@ CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n,
                                                  const float *alphas,
-                                                 const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                                 cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                                 const cl_mem *x_buffers, const size_t x_inc,
+                                                 cl_mem *y_buffers, const size_t y_inc,
                                                  const size_t batch_count,
                                                  cl_command_queue* queue, cl_event* event);
 CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n,
                                                  const double *alphas,
-                                                 const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                                 cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                                 const cl_mem *x_buffers, const size_t x_inc,
+                                                 cl_mem *y_buffers, const size_t y_inc,
                                                  const size_t batch_count,
                                                  cl_command_queue* queue, cl_event* event);
 CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n,
                                                  const cl_float2 *alphas,
-                                                 const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                                 cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                                 const cl_mem *x_buffers, const size_t x_inc,
+                                                 cl_mem *y_buffers, const size_t y_inc,
                                                  const size_t batch_count,
                                                  cl_command_queue* queue, cl_event* event);
 CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n,
                                                  const cl_double2 *alphas,
-                                                 const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                                 cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                                 const cl_mem *x_buffers, const size_t x_inc,
+                                                 cl_mem *y_buffers, const size_t y_inc,
                                                  const size_t batch_count,
                                                  cl_command_queue* queue, cl_event* event);
 CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n,
                                                  const cl_half *alphas,
-                                                 const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                                 cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                                 const cl_mem *x_buffers, const size_t x_inc,
+                                                 cl_mem *y_buffers, const size_t y_inc,
                                                  const size_t batch_count,
                                                  cl_command_queue* queue, cl_event* event);
 
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 521afe28..8dd5fc0c 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -41,7 +41,7 @@ FILES = [
     "/include/clblast_netlib_c.h",
     "/src/clblast_netlib_c.cpp",
 ]
-HEADER_LINES = [121, 76, 125, 23, 29, 41, 65, 32]
+HEADER_LINES = [122, 76, 126, 23, 29, 41, 65, 32]
 FOOTER_LINES = [25, 138, 27, 38, 6, 6, 9, 2]
 HEADER_LINES_DOC = 0
 FOOTER_LINES_DOC = 63
diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py
index 9b07e671..8807fd8e 100644
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@@ -223,7 +223,7 @@ class Routine:
         """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')"""
         if name in self.inputs or name in self.outputs:
             a = [name + "_buffer" + self.b_s()]
-            b = [name + "_offset"]
+            b = [name + "_offset"] if not self.batched else []
             c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
             return [", ".join(a + b + c)]
         return []
@@ -251,7 +251,7 @@ class Routine:
         prefix = "const " if name in self.inputs else ""
         if name in self.inputs or name in self.outputs:
             a = [prefix + "cl_mem " + self.b_star() + name + "_buffer" + self.b_s()]
-            b = ["const size_t " + name + "_offset"]
+            b = ["const size_t " + name + "_offset"] if not self.batched else []
             c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
             return [", ".join(a + b + c)]
         return []
@@ -295,7 +295,7 @@ class Routine:
                 a = [name + "_buffers_cpp"]
             else:
                 a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"]
-            b = [name + "_offset"]
+            b = [name + "_offset"] if not self.batched else []
             c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
             return [", ".join(a + b + c)]
         return []
@@ -337,7 +337,7 @@ class Routine:
         prefix = "const " if (name in self.inputs) else ""
         if (name in self.inputs) or (name in self.outputs):
             a = [prefix + "cl_mem" + self.b_star()]
-            b = ["const size_t"]
+            b = ["const size_t"] if not self.batched else []
             c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else []
             return [", ".join(a + b + c)]
         return []
@@ -350,12 +350,13 @@ class Routine:
             math_name = name.upper() + " matrix" + self.b_s() if (name in self.buffers_matrix()) else name + " vector" + self.b_s()
             inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment "
             a = ["`" + prefix + "cl_mem " + self.b_star() + name + "_buffer" + self.b_s() + "`: OpenCL buffer" + self.b_s() + " to store the " + inout + " " + math_name + "."]
-            b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."]
+            b = []
+            if not self.batched:
+                b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."]
+            c = []
             if name not in self.buffers_without_ld_inc():
                 c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " +
                      inc_ld_description + "of the " + inout + " " + math_name + ". This value must be greater than 0."]
-            else:
-                c = []
             return a + b + c
         return []
 
diff --git a/src/clblast.cpp b/src/clblast.cpp
index 55562419..e9cac664 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -2178,8 +2178,8 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
                        const T *alphas,
-                       const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                       cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                       const cl_mem *x_buffers, const size_t x_inc,
+                       cl_mem *y_buffers, const size_t y_inc,
                        const size_t batch_count,
                        cl_command_queue* queue, cl_event* event) {
   try {
@@ -2195,40 +2195,40 @@ StatusCode AxpyBatched(const size_t n,
     }
     routine.DoAxpyBatched(n,
                           alphas_cpp,
-                          x_buffers_cpp, x_offset, x_inc,
-                          y_buffers_cpp, y_offset, y_inc,
+                          x_buffers_cpp, x_inc,
+                          y_buffers_cpp, y_inc,
                           batch_count);
     return StatusCode::kSuccess;
   } catch (...) { return DispatchException(); }
 }
 template StatusCode PUBLIC_API AxpyBatched<float>(const size_t,
                                                   const float*,
-                                                  const cl_mem*, const size_t, const size_t,
-                                                  cl_mem*, const size_t, const size_t,
+                                                  const cl_mem*, const size_t,
+                                                  cl_mem*, const size_t,
                                                   const size_t,
                                                   cl_command_queue*, cl_event*);
 template StatusCode PUBLIC_API AxpyBatched<double>(const size_t,
                                                    const double*,
-                                                   const cl_mem*, const size_t, const size_t,
-                                                   cl_mem*, const size_t, const size_t,
+                                                   const cl_mem*, const size_t,
+                                                   cl_mem*, const size_t,
                                                    const size_t,
                                                    cl_command_queue*, cl_event*);
 template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t,
                                                    const float2*,
-                                                   const cl_mem*, const size_t, const size_t,
-                                                   cl_mem*, const size_t, const size_t,
+                                                   const cl_mem*, const size_t,
+                                                   cl_mem*, const size_t,
                                                    const size_t,
                                                    cl_command_queue*, cl_event*);
 template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t,
                                                     const double2*,
-                                                    const cl_mem*, const size_t, const size_t,
-                                                    cl_mem*, const size_t, const size_t,
+                                                    const cl_mem*, const size_t,
+                                                    cl_mem*, const size_t,
                                                     const size_t,
                                                     cl_command_queue*, cl_event*);
 template StatusCode PUBLIC_API AxpyBatched<half>(const size_t,
                                                  const half*,
-                                                 const cl_mem*, const size_t, const size_t,
-                                                 cl_mem*, const size_t, const size_t,
+                                                 const cl_mem*, const size_t,
+                                                 cl_mem*, const size_t,
                                                  const size_t,
                                                  cl_command_queue*, cl_event*);
 // =================================================================================================
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index 83450e6f..bd8ea51a 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -3450,8 +3450,8 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran
 // AXPY
 CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
                                       const float *alphas,
-                                      const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                      cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                      const cl_mem *x_buffers, const size_t x_inc,
+                                      cl_mem *y_buffers, const size_t y_inc,
                                       const size_t batch_count,
                                       cl_command_queue* queue, cl_event* event) {
   auto alphas_cpp = std::vector<float>();
@@ -3462,8 +3462,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
     return static_cast<CLBlastStatusCode>(
       clblast::AxpyBatched(n,
                            alphas_cpp.data(),
-                           x_buffers, x_offset, x_inc,
-                           y_buffers, y_offset, y_inc,
+                           x_buffers, x_inc,
+                           y_buffers, y_inc,
                            batch_count,
                            queue, event)
     );
@@ -3471,8 +3471,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
 }
 CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
                                       const double *alphas,
-                                      const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                      cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                      const cl_mem *x_buffers, const size_t x_inc,
+                                      cl_mem *y_buffers, const size_t y_inc,
                                       const size_t batch_count,
                                       cl_command_queue* queue, cl_event* event) {
   auto alphas_cpp = std::vector<double>();
@@ -3483,8 +3483,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
     return static_cast<CLBlastStatusCode>(
       clblast::AxpyBatched(n,
                            alphas_cpp.data(),
-                           x_buffers, x_offset, x_inc,
-                           y_buffers, y_offset, y_inc,
+                           x_buffers, x_inc,
+                           y_buffers, y_inc,
                            batch_count,
                            queue, event)
     );
@@ -3492,8 +3492,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
 }
 CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
                                       const cl_float2 *alphas,
-                                      const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                      cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                      const cl_mem *x_buffers, const size_t x_inc,
+                                      cl_mem *y_buffers, const size_t y_inc,
                                       const size_t batch_count,
                                       cl_command_queue* queue, cl_event* event) {
   auto alphas_cpp = std::vector<float2>();
@@ -3504,8 +3504,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
     return static_cast<CLBlastStatusCode>(
       clblast::AxpyBatched(n,
                            alphas_cpp.data(),
-                           x_buffers, x_offset, x_inc,
-                           y_buffers, y_offset, y_inc,
+                           x_buffers, x_inc,
+                           y_buffers, y_inc,
                            batch_count,
                            queue, event)
     );
@@ -3513,8 +3513,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
 }
 CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
                                       const cl_double2 *alphas,
-                                      const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                      cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                      const cl_mem *x_buffers, const size_t x_inc,
+                                      cl_mem *y_buffers, const size_t y_inc,
                                       const size_t batch_count,
                                       cl_command_queue* queue, cl_event* event) {
   auto alphas_cpp = std::vector<double2>();
@@ -3525,8 +3525,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
     return static_cast<CLBlastStatusCode>(
       clblast::AxpyBatched(n,
                            alphas_cpp.data(),
-                           x_buffers, x_offset, x_inc,
-                           y_buffers, y_offset, y_inc,
+                           x_buffers, x_inc,
+                           y_buffers, y_inc,
                            batch_count,
                            queue, event)
     );
@@ -3534,8 +3534,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
 }
 CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
                                       const cl_half *alphas,
-                                      const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc,
-                                      cl_mem *y_buffers, const size_t y_offset, const size_t y_inc,
+                                      const cl_mem *x_buffers, const size_t x_inc,
+                                      cl_mem *y_buffers, const size_t y_inc,
                                       const size_t batch_count,
                                       cl_command_queue* queue, cl_event* event) {
   auto alphas_cpp = std::vector<half>();
@@ -3546,8 +3546,8 @@ CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
     return static_cast<CLBlastStatusCode>(
       clblast::AxpyBatched(n,
                            alphas_cpp.data(),
-                           x_buffers, x_offset, x_inc,
-                           y_buffers, y_offset, y_inc,
+                           x_buffers, x_inc,
+                           y_buffers, y_inc,
                            batch_count,
                            queue, event)
     );
diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp
new file mode 100644
index 00000000..55458f43
--- /dev/null
+++ b/src/routines/levelx/xaxpybatched.cpp
@@ -0,0 +1,59 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the XaxpyBatched class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "routines/levelx/xaxpybatched.hpp"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name):
+    Xaxpy<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas,
+                                    const std::vector<Buffer<T>> &x_buffers, const size_t x_inc,
+                                    const std::vector<Buffer<T>> &y_buffers, const size_t y_inc,
+                                    const size_t batch_count) {
+  if (batch_count < 1) { throw BLASError(StatusCode::kInvalidBatchCount); }
+  if (alphas.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); }
+  if (x_buffers.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); }
+  if (y_buffers.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); }
+
+  // Naive implementation: calls regular Axpy multiple times
+  for (auto batch = size_t{0}; batch < batch_count; ++batch) {
+    DoAxpy(n, alphas[batch],
+           x_buffers[batch], 0, x_inc,
+           y_buffers[batch], 0, y_inc);
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class XaxpyBatched<half>;
+template class XaxpyBatched<float>;
+template class XaxpyBatched<double>;
+template class XaxpyBatched<float2>;
+template class XaxpyBatched<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/levelx/xaxpybatched.hpp b/src/routines/levelx/xaxpybatched.hpp
new file mode 100644
index 00000000..7fd14a74
--- /dev/null
+++ b/src/routines/levelx/xaxpybatched.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the XaxpyBatched routine. This is a non-blas batched version of AXPY.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAXPYBATCHED_H_
+#define CLBLAST_ROUTINES_XAXPYBATCHED_H_
+
+#include <vector>
+
+#include "routines/level1/xaxpy.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class XaxpyBatched: public Xaxpy<T> {
+ public:
+
+  // Uses the regular Xaxpy routine
+  using Xaxpy<T>::DoAxpy;
+
+  // Constructor
+  XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED");
+
+  // Templated-precision implementation of the routine
+  void DoAxpyBatched(const size_t n, const std::vector<T> &alphas,
+                     const std::vector<Buffer<T>> &x_buffers, const size_t x_inc,
+                     const std::vector<Buffer<T>> &y_buffers, const size_t y_inc,
+                     const size_t batch_count);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAXPYBATCHED_H_
+#endif
diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp
new file mode 100644
index 00000000..181b401c
--- /dev/null
+++ b/test/routines/levelx/xaxpybatched.hpp
@@ -0,0 +1,167 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the XaxpyBatched routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_
+#define CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_
+
+#include <vector>
+#include <string>
+
+#include "utilities/utilities.hpp"
+
+#ifdef CLBLAST_REF_CLBLAS
+  #include "test/wrapper_clblas.hpp"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+  #include "test/wrapper_cblas.hpp"
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXaxpyBatched {
+ public:
+
+  // Although it is a non-BLAS routine, it can still be tested against level-1 routines in a loop
+  static size_t BLASLevel() { return 1; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgXInc, kArgYInc,
+            kArgAlpha, kArgBatchCount};
+  }
+
+  // Helper to determine a different alpha value per batch
+  static T GetAlpha(const T alpha_base, const size_t batch_id) {
+    return alpha_base + Constant<T>(batch_id);
+  }
+
+  // Describes how to obtain the sizes of the buffers (per item, not for the full batch)
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    return args.n * args.y_inc;
+  }
+
+  // Describes how to set the sizes of all the buffers (per item, not for the full batch)
+  static void SetSizes(Arguments<T> &args) {
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to prepare the input data
+  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
+                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
+                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto alphas = std::vector<T>();
+    auto x_buffers = std::vector<cl_mem>();
+    auto y_buffers = std::vector<cl_mem>();
+    for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+      alphas.push_back(GetAlpha(args.alpha, batch));
+      x_buffers.push_back(buffers[batch].x_vec());
+      y_buffers.push_back(buffers[batch].y_vec());
+    }
+    auto status = AxpyBatched(args.n, alphas.data(),
+                              x_buffers.data(), args.x_inc,
+                              y_buffers.data(), args.y_inc,
+                              args.batch_count,
+                              &queue_plain, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  #ifdef CLBLAST_REF_CLBLAS
+    static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
+      auto queue_plain = queue();
+      for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+        auto event = cl_event{};
+        auto status = clblasXaxpy(args.n, GetAlpha(args.alpha, batch),
+                                  buffers[batch].x_vec, 0, args.x_inc,
+                                  buffers[batch].y_vec, 0, args.y_inc,
+                                  1, &queue_plain, 0, nullptr, &event);
+        clWaitForEvents(1, &event);
+        if (static_cast<StatusCode>(status) != StatusCode::kSuccess) {
+          return static_cast<StatusCode>(status);
+        }
+      }
+      return StatusCode::kSuccess;
+    }
+  #endif
+
+  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+  #ifdef CLBLAST_REF_CBLAS
+    static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
+      for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
+        std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+        std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
+        buffers[batch].x_vec.Read(queue, args.x_size, x_vec_cpu);
+        buffers[batch].y_vec.Read(queue, args.y_size, y_vec_cpu);
+        cblasXaxpy(args.n, GetAlpha(args.alpha, batch),
+                   x_vec_cpu, 0, args.x_inc,
+                   y_vec_cpu, 0, args.y_inc);
+        buffers[batch].y_vec.Write(queue, args.y_size, y_vec_cpu);
+      }
+      return StatusCode::kSuccess;
+    }
+  #endif
+
+  // Describes how to download the results of the computation (per item, not for the full batch)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.Read(queue, args.y_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer (per item, not for the full batch)
+  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1 * args.y_inc;
+  }
+
+  // Describes how to compute performance metrics (per item, not for the full batch)
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (3 * args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_
+#endif