Merge pull request #27 from CNugteren/level2_matrix_vector

Added many level-2 matrix-vector routines
author: Cedric Nugteren <web@cedricnugteren.nl> 2015-09-26 17:02:34 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2015-09-26 17:02:34 +0200
commit: 92b4b0d1feaaf92e160fa0342daf4269f24fb4d2 (patch)
tree: 3356b67a281d8292e893028d74a1801554ce0ef2
parent: 42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff)
parent: 2b56c2c60325f02bc695cbb968049cc09205c713 (diff)
47 files changed, 2585 insertions, 296 deletions
diff --git a/.gitignore b/.gitignore
index 604b0a64..de7becef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 build
 stash
-.*
-\ No newline at end of file
+.*
+*.pyc
+\ No newline at end of file
diff --git a/CHANGELOG b/CHANGELOG
index 5a91d171..c1981d7a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,6 @@
 
 Development version (next release)
+- Improved structure and performance of level-2 routines (xSYMV/xHEMV)
 - Added level-1 routines:
   * SSWAP/DSWAP/CSWAP/ZSWAP
   * SSCAL/DSCAL/CSCAL/ZSCAL
@@ -7,6 +8,15 @@ Development version (next release)
   * SDOT/DDOT
   * CDOTU/ZDOTU
   * CDOTC/ZDOTC
+- Added level-2 routines:
+  * SGBMV/DGBMV/CGBMV/ZGBMV
+  * CHBMV/ZHBMV
+  * CHPMV/ZHPMV
+  * SSBMV/DSBMV
+  * SSPMV/DSPMV
+  * STRMV/DTRMV/CTRMV/ZTRMV
+  * STBMV/DTBMV/CTBMV/ZTBMV
+  * STPMV/DTPMV/CTPMV/ZTPMV
 
 Version 0.4.0
 - Now using the Claduc C++11 interface to OpenCL
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1960bf1d..1ddd2f77 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,7 +106,7 @@ set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm)
 set(SAMPLE_PROGRAMS_CPP sgemm)
 set(SAMPLE_PROGRAMS_C sgemm)
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
-set(LEVEL2_ROUTINES xgemv xhemv xsymv)
+set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
 set(PRECISIONS 32 3232 64 6464)
diff --git a/README.md b/README.md
index 5ffc18d6..8c7870a2 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ CLBlast: The tuned OpenCL BLAS library
 
 CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
 
-__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support all routines yet: others will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
+__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
 
 
 Why CLBlast and not clBLAS or cuBLAS?
@@ -130,7 +130,7 @@ These graphs can be generated automatically on your own device. First, compile C
 Supported routines
 -------------
 
-CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
+CLBlast is in active development but already supports the majority of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
 
 | Level-1  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
@@ -153,16 +153,16 @@ CLBlast is in active development and currently does not support the full set of
 | Level-2  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
 | xGEMV    | ✔ | ✔ | ✔ | ✔ |         |
-| xGBMV    |   |   |   |   |         |
+| xGBMV    | ✔ | ✔ | ✔ | ✔ |         |
 | xHEMV    | - | - | ✔ | ✔ |         |
-| xHBMV    | - | - |   |   |         |
-| xHPMV    | - | - |   |   |         |
+| xHBMV    | - | - | ✔ | ✔ |         |
+| xHPMV    | - | - | ✔ | ✔ |         |
 | xSYMV    | ✔ | ✔ | - | - |         |
-| xSBMV    |   |   | - | - |         |
-| xSPMV    |   |   | - | - |         |
-| xTRMV    |   |   |   |   |         |
-| xTBMV    |   |   |   |   |         |
-| xTPMV    |   |   |   |   |         |
+| xSBMV    | ✔ | ✔ | - | - |         |
+| xSPMV    | ✔ | ✔ | - | - |         |
+| xTRMV    | ✔ | ✔ | ✔ | ✔ |         |
+| xTBMV    | ✔ | ✔ | ✔ | ✔ |         |
+| xTPMV    | ✔ | ✔ | ✔ | ✔ |         |
 | xTRSV    |   |   |   |   |         |
 | xTBSV    |   |   |   |   |         |
 | xTPSV    |   |   |   |   |         |
diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h
index 2c2cc797..df7a0d82 100644
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
@@ -493,11 +493,11 @@ class Buffer {
   }
 
   // Copies the contents of this buffer into another device buffer
-  void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) {
+  void CopyToAsync(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
     CheckError(clEnqueueCopyBuffer(queue(), *buffer_, destination(), 0, 0, size*sizeof(T), 0,
                                    nullptr, nullptr));
   }
-  void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) {
+  void CopyTo(const Queue &queue, const size_t size, const Buffer<T> &destination) const {
     CopyToAsync(queue, size, destination);
     queue.Finish();
   }
diff --git a/include/internal/routine.h b/include/internal/routine.h
index c5b253b4..b7c06a97 100644
--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@@ -72,6 +72,8 @@ class Routine {
                          const size_t offset, const size_t ld, const size_t data_size);
   StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
                          const size_t offset, const size_t ld, const size_t data_size);
+  StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer,
+                          const size_t offset, const size_t data_size);
 
   // Tests for valid inputs of vectors X and Y
   StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h
new file mode 100644
index 00000000..27b033e9
--- /dev/null
+++ b/include/internal/routines/level2/xgbmv.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGBMV_H_
+#define CLBLAST_ROUTINES_XGBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgbmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n, const size_t kl, const size_t ku,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h
index 1e120a5e..b31565ec 100644
--- a/include/internal/routines/level2/xgemv.h
+++ b/include/internal/routines/level2/xgemv.h
@@ -32,6 +32,7 @@ class Xgemv: public Routine<T> {
   using Routine<T>::TestVectorX;
   using Routine<T>::TestVectorY;
   using Routine<T>::TestMatrixA;
+  using Routine<T>::TestMatrixAP;
   using Routine<T>::RunKernel;
   using Routine<T>::ErrorIn;
 
@@ -47,6 +48,18 @@ class Xgemv: public Routine<T> {
                     const T beta,
                     const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 
+  // Generic version used also for other matrix-vector multiplications
+  StatusCode MatVec(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    bool fast_kernel, bool fast_kernel_rot,
+                    const size_t parameter, const bool packed,
+                    const size_t kl, const size_t ku);
+
  private:
   // Static variable to get the precision
   const static Precision precision_;
diff --git a/include/internal/routines/level2/xhbmv.h b/include/internal/routines/level2/xhbmv.h
new file mode 100644
index 00000000..65138424
--- /dev/null
+++ b/include/internal/routines/level2/xhbmv.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHBMV_H_
+#define CLBLAST_ROUTINES_XHBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xhbmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xhbmv(Queue &queue, Event &event, const std::string &name = "HBMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHbmv(const Layout layout, const Triangle triangle,
+                    const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h
index 801b2fc3..b74db760 100644
--- a/include/internal/routines/level2/xhemv.h
+++ b/include/internal/routines/level2/xhemv.h
@@ -7,8 +7,9 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xhemv routine. It is based on the generalized matrix multiplication
-// routine (Xgemv). The implementation is very similar to the Xsymv routine.
+// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
 //
 // =================================================================================================
 
@@ -25,16 +26,8 @@ template <typename T>
 class Xhemv: public Xgemv<T> {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
-  // Uses the regular Xgemv routine
-  using Xgemv<T>::DoGemv;
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
 
   // Constructor
   Xhemv(Queue &queue, Event &event, const std::string &name = "HEMV");
diff --git a/include/internal/routines/level2/xhpmv.h b/include/internal/routines/level2/xhpmv.h
new file mode 100644
index 00000000..48f1ed3f
--- /dev/null
+++ b/include/internal/routines/level2/xhpmv.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPMV_H_
+#define CLBLAST_ROUTINES_XHPMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xhpmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xhpmv(Queue &queue, Event &event, const std::string &name = "HPMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHpmv(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPMV_H_
+#endif
diff --git a/include/internal/routines/level2/xsbmv.h b/include/internal/routines/level2/xsbmv.h
new file mode 100644
index 00000000..bb24d8f4
--- /dev/null
+++ b/include/internal/routines/level2/xsbmv.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSBMV_H_
+#define CLBLAST_ROUTINES_XSBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsbmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xsbmv(Queue &queue, Event &event, const std::string &name = "SBMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSbmv(const Layout layout, const Triangle triangle,
+                    const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xspmv.h b/include/internal/routines/level2/xspmv.h
new file mode 100644
index 00000000..88f02a2f
--- /dev/null
+++ b/include/internal/routines/level2/xspmv.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xspmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPMV_H_
+#define CLBLAST_ROUTINES_XSPMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xspmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xspmv(Queue &queue, Event &event, const std::string &name = "SPMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSpmv(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPMV_H_
+#endif
diff --git a/include/internal/routines/level2/xsymv.h b/include/internal/routines/level2/xsymv.h
index ab6da6d1..c7b92702 100644
--- a/include/internal/routines/level2/xsymv.h
+++ b/include/internal/routines/level2/xsymv.h
@@ -9,8 +9,7 @@
 //
 // This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication
 // routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call the
-// "DoGemm" function directly. The "DoSymv" function first preprocesses the symmetric matrix by
-// transforming it into a general matrix, and then calls the regular GEMV code.
+// "MatVec" function directly.
 //
 // =================================================================================================
 
@@ -27,16 +26,8 @@ template <typename T>
 class Xsymv: public Xgemv<T> {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
-  // Uses the regular Xgemv routine
-  using Xgemv<T>::DoGemv;
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
 
   // Constructor
   Xsymv(Queue &queue, Event &event, const std::string &name = "SYMV");
diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h
new file mode 100644
index 00000000..89c90193
--- /dev/null
+++ b/include/internal/routines/level2/xtbmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTBMV_H_
+#define CLBLAST_ROUTINES_XTBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtbmv: public Xgemv<T> {
+ public:
+  
+  // Members from the base class
+  using Routine<T>::queue_;
+  using Routine<T>::context_;
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoTbmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n, const size_t k,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h
new file mode 100644
index 00000000..183d3505
--- /dev/null
+++ b/include/internal/routines/level2/xtpmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTPMV_H_
+#define CLBLAST_ROUTINES_XTPMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtpmv: public Xgemv<T> {
+ public:
+  
+  // Members from the base class
+  using Routine<T>::queue_;
+  using Routine<T>::context_;
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoTpmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTPMV_H_
+#endif
diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h
new file mode 100644
index 00000000..dadfbc98
--- /dev/null
+++ b/include/internal/routines/level2/xtrmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMV_H_
+#define CLBLAST_ROUTINES_XTRMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtrmv: public Xgemv<T> {
+ public:
+  
+  // Members from the base class
+  using Routine<T>::queue_;
+  using Routine<T>::context_;
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoTrmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMV_H_
+#endif
diff --git a/include/internal/utilities.h b/include/internal/utilities.h
index 466ac810..bd174ccb 100644
--- a/include/internal/utilities.h
+++ b/include/internal/utilities.h
@@ -41,6 +41,8 @@ const std::string kKhronosDoublePrecision = "cl_khr_fp64";
 constexpr auto kArgM = "m";
 constexpr auto kArgN = "n";
 constexpr auto kArgK = "k";
+constexpr auto kArgKL = "kl";
+constexpr auto kArgKU = "ku";
 constexpr auto kArgLayout = "layout";
 constexpr auto kArgATransp = "transA";
 constexpr auto kArgBTransp = "transB";
@@ -57,6 +59,7 @@ constexpr auto kArgCLeadDim = "ldc";
 constexpr auto kArgAOffset = "offa";
 constexpr auto kArgBOffset = "offb";
 constexpr auto kArgCOffset = "offc";
+constexpr auto kArgAPOffset = "offap";
 constexpr auto kArgDotOffset = "offdot";
 constexpr auto kArgAlpha = "alpha";
 constexpr auto kArgBeta = "beta";
@@ -87,9 +90,11 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
 template <typename T>
 struct Arguments {
   // Routine-specific arguments
-  size_t m = 0;
-  size_t n = 0;
-  size_t k = 0;
+  size_t m = 1;
+  size_t n = 1;
+  size_t k = 1;
+  size_t ku = 1;
+  size_t kl = 1;
   Layout layout = Layout::kRowMajor;
   Transpose a_transpose = Transpose::kNo;
   Transpose b_transpose = Transpose::kNo;
@@ -100,12 +105,13 @@ struct Arguments {
   size_t y_inc = 1;
   size_t x_offset = 0;
   size_t y_offset = 0;
-  size_t a_ld = 0;
-  size_t b_ld = 0;
-  size_t c_ld = 0;
+  size_t a_ld = 1;
+  size_t b_ld = 1;
+  size_t c_ld = 1;
   size_t a_offset = 0;
   size_t b_offset = 0;
   size_t c_offset = 0;
+  size_t ap_offset = 0;
   size_t dot_offset = 0;
   T alpha = T{1.0};
   T beta = T{1.0};
@@ -114,6 +120,7 @@ struct Arguments {
   size_t a_size = 1;
   size_t b_size = 1;
   size_t c_size = 1;
+  size_t ap_size = 1;
   size_t dot_size = 1;
   // Tuner-specific arguments
   double fraction = 1.0;
@@ -141,6 +148,7 @@ struct Buffers {
   Buffer<T> a_mat;
   Buffer<T> b_mat;
   Buffer<T> c_mat;
+  Buffer<T> ap_mat;
   Buffer<T> dot;
 };
 
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 677c8afc..25f02861 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -64,16 +64,16 @@ routines = [
 ],
 [ # Level 2: matrix-vector
   Routine(True,  "2a", "gemv",  T,  [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"),
-  Routine(False, "2a", "gbmv",  T,  [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
+  Routine(True,  "2a", "gbmv",  T,  [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
   Routine(True,  "2a", "hemv",  T,  [C,Z],     ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"),
-  Routine(False, "2a", "hbmv",  T,  [C,Z],     ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"),
-  Routine(False, "2a", "hpmv",  T,  [C,Z],     ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"),
+  Routine(True,  "2a", "hbmv",  T,  [C,Z],     ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"),
+  Routine(True,  "2a", "hpmv",  T,  [C,Z],     ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"),
   Routine(True,  "2a", "symv",  T,  [S,D],     ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"),
-  Routine(False, "2a", "sbmv",  T,  [S,D],     ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"),
-  Routine(False, "2a", "spmv",  T,  [S,D],     ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"),
-  Routine(False, "2a", "trmv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
-  Routine(False, "2a", "tbmv",  T,  [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
-  Routine(False, "2a", "tpmv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
+  Routine(True,  "2a", "sbmv",  T,  [S,D],     ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"),
+  Routine(True,  "2a", "spmv",  T,  [S,D],     ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"),
+  Routine(True,  "2a", "trmv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
+  Routine(True,  "2a", "tbmv",  T,  [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
+  Routine(True,  "2a", "tpmv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
   Routine(False, "2a", "trsv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"),
   Routine(False, "2a", "tbsv",  T,  [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
   Routine(False, "2a", "tpsv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
@@ -213,7 +213,7 @@ def wrapper_clblas(routines):
 			if routine.scratch:
 				result += "  auto queue = Queue(queues[0]);\n"
 				result += "  auto context = queue.GetContext();\n"
-				result += "  auto scratch_buffer = Buffer<"+flavour.template+">(context, n);\n"
+				result += "  auto scratch_buffer = Buffer<"+flavour.template+">(context, n*x_inc + x_offset);\n"
 				arguments += ["scratch_buffer()"]
 			result += "  return clblas"+flavour.name+routine.name+"("
 			result += (",\n"+indent).join([a for a in arguments])
@@ -237,7 +237,7 @@ files = [
   path_clblast+"/src/clblast_c.cc",
   path_clblast+"/test/wrapper_clblas.h",
 ]
-header_lines = [84, 44, 80, 24, 22]
+header_lines = [84, 52, 80, 24, 22]
 footer_lines = [6, 3, 5, 2, 6]
 
 # Checks whether the command-line arguments are valid; exists otherwise
diff --git a/src/clblast.cc b/src/clblast.cc
index a0dd8c70..77999aaf 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -28,8 +28,16 @@
 
 // BLAS level-2 includes
 #include "internal/routines/level2/xgemv.h"
+#include "internal/routines/level2/xgbmv.h"
 #include "internal/routines/level2/xhemv.h"
+#include "internal/routines/level2/xhbmv.h"
+#include "internal/routines/level2/xhpmv.h"
 #include "internal/routines/level2/xsymv.h"
+#include "internal/routines/level2/xsbmv.h"
+#include "internal/routines/level2/xspmv.h"
+#include "internal/routines/level2/xtrmv.h"
+#include "internal/routines/level2/xtbmv.h"
+#include "internal/routines/level2/xtpmv.h"
 
 // BLAS level-3 includes
 #include "internal/routines/level3/xgemm.h"
@@ -327,15 +335,26 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
 
 // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
 template <typename T>
-StatusCode Gbmv(const Layout, const Transpose,
-                const size_t, const size_t, const size_t, const size_t,
-                const T,
-                const cl_mem, const size_t, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
+                const size_t m, const size_t n, const size_t kl, const size_t ku,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xgbmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoGbmv(layout, a_transpose,
+                        m, n, kl, ku,
+                        alpha,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Gbmv<float>(const Layout, const Transpose,
                                 const size_t, const size_t, const size_t, const size_t,
@@ -412,15 +431,26 @@ template StatusCode Hemv<double2>(const Layout, const Triangle,
 
 // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
 template <typename T>
-StatusCode Hbmv(const Layout, const Triangle,
-                const size_t, const size_t,
-                const T,
-                const cl_mem, const size_t, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Hbmv(const Layout layout, const Triangle triangle,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xhbmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoHbmv(layout, triangle,
+                        n, k,
+                        alpha,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Hbmv<float2>(const Layout, const Triangle,
                                  const size_t, const size_t,
@@ -441,15 +471,26 @@ template StatusCode Hbmv<double2>(const Layout, const Triangle,
 
 // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
 template <typename T>
-StatusCode Hpmv(const Layout, const Triangle,
-                const size_t,
-                const T,
-                const cl_mem, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Hpmv(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem ap_buffer, const size_t ap_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xhpmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoHpmv(layout, triangle,
+                        n,
+                        alpha,
+                        Buffer<T>(ap_buffer), ap_offset,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Hpmv<float2>(const Layout, const Triangle,
                                  const size_t,
@@ -510,15 +551,26 @@ template StatusCode Symv<double>(const Layout, const Triangle,
 
 // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
 template <typename T>
-StatusCode Sbmv(const Layout, const Triangle,
-                const size_t, const size_t,
-                const T,
-                const cl_mem, const size_t, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Sbmv(const Layout layout, const Triangle triangle,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xsbmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoSbmv(layout, triangle,
+                        n, k,
+                        alpha,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Sbmv<float>(const Layout, const Triangle,
                                 const size_t, const size_t,
@@ -539,15 +591,26 @@ template StatusCode Sbmv<double>(const Layout, const Triangle,
 
 // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
 template <typename T>
-StatusCode Spmv(const Layout, const Triangle,
-                const size_t,
-                const T,
-                const cl_mem, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Spmv(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem ap_buffer, const size_t ap_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xspmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoSpmv(layout, triangle,
+                        n,
+                        alpha,
+                        Buffer<T>(ap_buffer), ap_offset,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Spmv<float>(const Layout, const Triangle,
                                 const size_t,
@@ -568,12 +631,20 @@ template StatusCode Spmv<double>(const Layout, const Triangle,
 
 // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
 template <typename T>
-StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
-                const size_t,
-                const cl_mem, const size_t, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtrmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoTrmv(layout, triangle, a_transpose, diagonal,
+                        n,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
                                 const size_t,
@@ -598,12 +669,20 @@ template StatusCode Trmv<double2>(const Layout, const Triangle, const Transpose,
 
 // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
 template <typename T>
-StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal,
-                const size_t, const size_t,
-                const cl_mem, const size_t, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n, const size_t k,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtbmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoTbmv(layout, triangle, a_transpose, diagonal,
+                        n, k,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
                                 const size_t, const size_t,
@@ -628,12 +707,20 @@ template StatusCode Tbmv<double2>(const Layout, const Triangle, const Transpose,
 
 // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
 template <typename T>
-StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal,
-                const size_t,
-                const cl_mem, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem ap_buffer, const size_t ap_offset,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtpmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoTpmv(layout, triangle, a_transpose, diagonal,
+                        n,
+                        Buffer<T>(ap_buffer), ap_offset,
+                        Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
                                 const size_t,
diff --git a/src/kernels/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 1e12dd78..8ed0e9e4 100644
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -79,22 +79,189 @@ R"(
 #endif
 
 // =================================================================================================
-// Defines how to load the input matrix in the regular case
 
-// Loads a scalar input value
+// Defines how to load the input matrix in the non-vectorized case
 inline real LoadMatrixA(const __global real* restrict agm, const int x, const int y,
-                        const int a_ld, const int a_offset) {
-  return agm[x + a_ld*y + a_offset];
+                        const int a_ld, const int a_offset, const int parameter,
+                        const int kl, const int ku) {
+  real result;
+
+  // For banded matrices
+  #if defined(ROUTINE_GBMV)
+    const int k = ku - y;
+    if (x >= y-ku && x < y+kl+1) { result = agm[a_ld*y + k + x + a_offset]; }
+    else { SetToZero(result); }
+
+  // For symmetric/hermitian matrices
+  #elif defined(ROUTINE_HEMV) || defined(ROUTINE_SYMV)
+    if ((parameter == 0 && y <= x) || (parameter == 1 && x <= y)) {
+      result = agm[a_ld*y + x + a_offset];
+      #if defined(ROUTINE_HEMV)
+        if (x == y) { result.y = ZERO; }
+      #endif
+    }
+    else {
+      result = agm[a_ld*x + y + a_offset];
+      #if defined(ROUTINE_HEMV)
+        COMPLEX_CONJUGATE(result);
+      #endif
+    }
+
+  // For triangular matrices
+  #elif defined(ROUTINE_TRMV)
+    if (((parameter == 0 || parameter == 2) && y <= x) ||
+        ((parameter == 1 || parameter == 3) && x <= y)) {
+      result = agm[a_ld*y + x + a_offset];
+      if (parameter >= 2 && y == x) {
+        SetToOne(result);
+      }
+    }
+    else {
+      SetToZero(result);
+    }
+
+  // For symmetric/hermitian banded matrices
+  #elif defined(ROUTINE_HBMV) || defined(ROUTINE_SBMV)
+    if (parameter == 1) {
+      if (x <= y) {
+        const int m = kl - y;
+        if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; }
+        else { SetToZero(result); }
+        #if defined(ROUTINE_HBMV)
+          if (x == y) { result.y = ZERO; }
+        #endif
+      }
+      else {
+        const int m = kl - x;
+        if (y >= x-kl && y <= x) { result = agm[a_ld*x + m + y + a_offset]; }
+        else { SetToZero(result); }
+        #if defined(ROUTINE_HBMV)
+          COMPLEX_CONJUGATE(result);
+        #endif
+      }
+    }
+    else {
+      if (x >= y) {
+        const int m = -y;
+        if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; }
+        else { SetToZero(result); }
+        #if defined(ROUTINE_HBMV)
+          if (x == y) { result.y = ZERO; }
+        #endif
+      }
+      else {
+        const int m = -x;
+        if (y >= x && y < x+kl+1) { result = agm[a_ld*x + m + y + a_offset]; }
+        else { SetToZero(result); }
+        #if defined(ROUTINE_HBMV)
+          COMPLEX_CONJUGATE(result);
+        #endif
+      }
+    }
+
+  // For triangular banded matrices
+  #elif defined(ROUTINE_TBMV)
+    if (parameter == 1 || parameter == 3) {
+      if (x <= y) {
+        const int m = kl - y;
+        if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; }
+        else { SetToZero(result); }
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+    else {
+      if (x >= y) {
+        const int m = -y;
+        if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; }
+        else { SetToZero(result); }
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+
+  // For symmetric/hermitian packed matrices
+  #elif defined(ROUTINE_HPMV) || defined(ROUTINE_SPMV)
+    if (parameter == 1) {
+      if (x <= y) {
+        result = agm[((y+1)*y)/2 + x + a_offset];
+        #if defined(ROUTINE_HPMV)
+          if (x == y) { result.y = ZERO; }
+        #endif
+      }
+      else {
+        result = agm[((x+1)*x)/2 + y + a_offset];
+        #if defined(ROUTINE_HPMV)
+          COMPLEX_CONJUGATE(result);
+        #endif
+      }
+    }
+    else {
+      if (x >= y) {
+        result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset];
+        #if defined(ROUTINE_HPMV)
+          if (x == y) { result.y = ZERO; }
+        #endif
+      }
+      else {
+        result = agm[((2*a_ld-(x+1))*x)/2 + y + a_offset];
+        #if defined(ROUTINE_HPMV)
+          COMPLEX_CONJUGATE(result);
+        #endif
+      }
+    }
+
+  // For triangular packed matrices
+  #elif defined(ROUTINE_TPMV)
+    if (parameter == 1 || parameter == 3) {
+      if (x <= y) {
+        result = agm[((y+1)*y)/2 + x + a_offset];
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+    else {
+      if (x >= y) {
+        result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset];
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+
+  // For general matrices
+  #else
+    result = agm[a_ld*y + x + a_offset];
+  #endif
+
+  return result;
 }
+
 // Loads a vector input value (1/2)
 inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
                             const int a_ld) {
-  return agm[x + a_ld*y];
+  return agm[a_ld*y + x];
 }
+
 // Loads a vector input value (2/2): as before, but different data-type
 inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
                               const int a_ld) {
-  return agm[x + a_ld*y];
+  return agm[a_ld*y + x];
 }
 
 // =================================================================================================
@@ -106,7 +273,8 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
                     const __global real* restrict agm, const int a_offset, const int a_ld,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc,
-                    const int do_conjugate) {
+                    const int do_conjugate, const int parameter,
+                    const int kl, const int ku) {
 
   // Local memory for the vector X
   __local real xlm[WGS1];
@@ -141,20 +309,20 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
         // The multiply-add function for the main part (divisable by WGS1)
         if (a_rotated == 0) { // Not rotated
           #pragma unroll
-          for (int kl=0; kl<WGS1; ++kl) {
-            const int k = kwg + kl;
-            real value = LoadMatrixA(agm, gid, k, a_ld, a_offset);
+          for (int kloop=0; kloop<WGS1; ++kloop) {
+            const int k = kwg + kloop;
+            real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
             if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-            MultiplyAdd(acc[w], xlm[kl], value);
+            MultiplyAdd(acc[w], xlm[kloop], value);
           }
         }
         else { // Transposed
           #pragma unroll
-          for (int kl=0; kl<WGS1; ++kl) {
-            const int k = kwg + kl;
-            real value = LoadMatrixA(agm, k, gid, a_ld, a_offset);
+          for (int kloop=0; kloop<WGS1; ++kloop) {
+            const int k = kwg + kloop;
+            real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
             if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-            MultiplyAdd(acc[w], xlm[kl], value);
+            MultiplyAdd(acc[w], xlm[kloop], value);
           }
         }
       }
@@ -174,7 +342,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
       if (a_rotated == 0) { // Not rotated
         #pragma unroll
         for (int k=n_floor; k<n; ++k) {
-          real value = LoadMatrixA(agm, gid, k, a_ld, a_offset);
+          real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
           if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
           MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value);
         }
@@ -182,7 +350,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
       else { // Transposed
         #pragma unroll
         for (int k=n_floor; k<n; ++k) {
-          real value = LoadMatrixA(agm, k, gid, a_ld, a_offset);
+          real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
           if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
           MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value);
         }
@@ -209,7 +377,8 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
                         const __global realVF* restrict agm, const int a_offset, const int a_ld,
                         const __global real* restrict xgm, const int x_offset, const int x_inc,
                         __global real* ygm, const int y_offset, const int y_inc,
-                        const int do_conjugate) {
+                        const int do_conjugate, const int parameter,
+                        const int kl, const int ku) {
   // Local memory for the vector X
   __local real xlm[WGS2];
 
@@ -305,7 +474,8 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
                            const __global realVFR* restrict agm, const int a_offset, const int a_ld,
                            const __global real* restrict xgm, const int x_offset, const int x_inc,
                            __global real* ygm, const int y_offset, const int y_inc,
-                           const int do_conjugate) {
+                           const int do_conjugate, const int parameter,
+                           const int kl, const int ku) {
   // Local memory for the vector X
   __local real xlm[WGS3];
 
diff --git a/src/routine.cc b/src/routine.cc
index 05a03683..2978c94a 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -191,6 +191,18 @@ StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buf
   return StatusCode::kSuccess;
 }
 
+// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size
+template <typename T>
+StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer,
+                                    const size_t offset, const size_t data_size) {
+  try {
+    auto required_size = (((n*(n+1))/2) + offset)*data_size;
+    auto buffer_size = buffer.GetSize();
+    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
+  } catch (...) { return StatusCode::kInvalidMatrixA; }
+  return StatusCode::kSuccess;
+}
+
 // =================================================================================================
 
 // Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc
new file mode 100644
index 00000000..14d391ca
--- /dev/null
+++ b/src/routines/level2/xgbmv.cc
@@ -0,0 +1,67 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xgbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n, const size_t kl, const size_t ku,
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // Reverses the upper and lower band count
+  auto rotated = (layout == Layout::kRowMajor);
+  auto kl_real = (rotated) ? ku : kl;
+  auto ku_real = (rotated) ? kl : ku;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_GBMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, a_transpose,
+                m, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                0, false, kl_real, ku_real);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgbmv<float>;
+template class Xgbmv<double>;
+template class Xgbmv<float2>;
+template class Xgbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index f95a9957..1b768dcd 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -32,8 +32,7 @@ template <typename T>
 Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
     Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
   source_string_ =
-    #include "../../kernels/pad.opencl" // For {Herm,Symm}{Upper,Lower}ToSquared (for HEMV/SYMV)
-    #include "../../kernels/xgemv.opencl"
+    #include "../../kernels/level2/xgemv.opencl"
   ;
 }
 
@@ -49,6 +48,31 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
                             const T beta,
                             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
 
+  // Performs the matrix-vector multiplication
+  return MatVec(layout, a_transpose,
+                m, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                true, true,
+                0, false, 0, 0); // N/A for this routine
+}
+
+// =================================================================================================
+
+// The generic implementation, also suited for other (non general) matrix-vector multiplications
+template <typename T>
+StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                            bool fast_kernel, bool fast_kernel_rot,
+                            const size_t parameter, const bool packed,
+                            const size_t kl, const size_t ku) {
+
   // Makes sure all dimensions are larger than zero
   if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
 
@@ -62,6 +86,11 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
   auto m_real = (a_transposed) ? n : m;
   auto n_real = (a_transposed) ? m : n;
 
+  // Special adjustments for banded matrices
+  if (kl != 0 || ku != 0) {
+    a_one = kl+ku+1;
+  }
+
   // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
   auto a_rotated = a_transposed ^ a_altlayout;
 
@@ -69,7 +98,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
   auto a_conjugate = (a_transpose == Transpose::kConjugate);
 
   // Tests the matrix and the vectors for validity
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = StatusCode::kSuccess;
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
+  else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); }
   if (ErrorIn(status)) { return status; }
   status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
   if (ErrorIn(status)) { return status; }
@@ -77,26 +108,26 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
   if (ErrorIn(status)) { return status; }
 
   // Determines whether or not the fast-version can be used
-  bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
-                         IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
-                         IsMultiple(n, db_["WGS2"]) &&
-                         IsMultiple(a_ld, db_["VW2"]);
-  bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
-                             IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
-                             IsMultiple(n, db_["WGS3"]) &&
-                             IsMultiple(a_ld, db_["VW3"]);
+  fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
+                IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
+                IsMultiple(n, db_["WGS2"]) &&
+                IsMultiple(a_ld, db_["VW2"]);
+  fast_kernel_rot = fast_kernel_rot && (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
+                    IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
+                    IsMultiple(n, db_["WGS3"]) &&
+                    IsMultiple(a_ld, db_["VW3"]);
 
   // If possible, run the fast-version (rotated or non-rotated) of the kernel
   auto kernel_name = "Xgemv";
   auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
   auto global_size = m_ceiled / db_["WPT1"];
   auto local_size = db_["WGS1"];
-  if (use_fast_kernel) {
+  if (fast_kernel) {
     kernel_name = "XgemvFast";
     global_size = m_real / db_["WPT2"];
     local_size = db_["WGS2"];
   }
-  if (use_fast_kernel_rot) {
+  if (fast_kernel_rot) {
     kernel_name = "XgemvFastRot";
     global_size = m_real / db_["WPT3"];
     local_size = db_["WGS3"];
@@ -123,6 +154,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
     kernel.SetArgument(12, static_cast<int>(y_offset));
     kernel.SetArgument(13, static_cast<int>(y_inc));
     kernel.SetArgument(14, static_cast<int>(a_conjugate));
+    kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
+    kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
+    kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
 
     // Launches the kernel
     auto global = std::vector<size_t>{global_size};
diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc
new file mode 100644
index 00000000..f59a7cb3
--- /dev/null
+++ b/src/routines/level2/xhbmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xhbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
+                            const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_HBMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, false, k, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhbmv<float2>;
+template class Xhbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc
index 2d92e45f..5a58b28b 100644
--- a/src/routines/level2/xhemv.cc
+++ b/src/routines/level2/xhemv.cc
@@ -37,57 +37,21 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
                             const T beta,
                             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
 
-  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
-
-  // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
-  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
-
-  // Temporary buffer for a copy of the hermitian matrix
-  try {
-    auto temp_herm = Buffer<T>(context_, n*n);
-
-    // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemv
-    // routine afterwards
-    try {
-      auto& program = GetProgramFromCache();
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the hermitian-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(n));
-      kernel.SetArgument(5, static_cast<int>(n));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_herm());
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // hermitian-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the regular Xgemv code
-      status = DoGemv(layout, Transpose::kNo, n, n, alpha,
-                      temp_herm, 0, n,
-                      x_buffer, x_offset, x_inc, beta,
-                      y_buffer, y_offset, y_inc);
-
-      // Return the status of the Xgemv routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_HEMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, false, 0, 0);
 }
 
 // =================================================================================================
diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc
new file mode 100644
index 00000000..2269255d
--- /dev/null
+++ b/src/routines/level2/xhpmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xhpmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhpmv<T>::Xhpmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
+                            const size_t n,
+                            const T alpha,
+                            const Buffer<T> &ap_buffer, const size_t ap_offset,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_HPMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                ap_buffer, ap_offset, n,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, true, 0, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhpmv<float2>;
+template class Xhpmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc
new file mode 100644
index 00000000..457bd762
--- /dev/null
+++ b/src/routines/level2/xsbmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xsbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xsbmv<T>::Xsbmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
+                            const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_SBMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, false, k, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xsbmv<float>;
+template class Xsbmv<double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc
new file mode 100644
index 00000000..4f1a9c61
--- /dev/null
+++ b/src/routines/level2/xspmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xspmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xspmv<T>::Xspmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
+                            const size_t n,
+                            const T alpha,
+                            const Buffer<T> &ap_buffer, const size_t ap_offset,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_SPMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                ap_buffer, ap_offset, n,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, true, 0, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xspmv<float>;
+template class Xspmv<double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc
index 2ccb51f6..ec12324b 100644
--- a/src/routines/level2/xsymv.cc
+++ b/src/routines/level2/xsymv.cc
@@ -37,57 +37,21 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
                             const T beta,
                             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
 
-  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
-
-  // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
-  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
-
-  // Temporary buffer for a copy of the symmetric matrix
-  try {
-    auto temp_symm = Buffer<T>(context_, n*n);
-
-    // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemv
-    // routine afterwards
-    try {
-      auto& program = GetProgramFromCache();
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the symmetric-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(n));
-      kernel.SetArgument(5, static_cast<int>(n));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_symm());
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // symmetric-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the regular Xgemv code
-      status = DoGemv(layout, Transpose::kNo, n, n, alpha,
-                      temp_symm, 0, n,
-                      x_buffer, x_offset, x_inc, beta,
-                      y_buffer, y_offset, y_inc);
-
-      // Return the status of the Xgemv routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific symmetric matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_SYMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, false, 0, 0);
 }
 
 // =================================================================================================
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc
new file mode 100644
index 00000000..2e1aebff
--- /dev/null
+++ b/src/routines/level2/xtbmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtbmv<T>::Xtbmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n, const size_t k,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  // Creates a copy of X: a temporary scratch buffer
+  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+  try {
+    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  } catch (...) { } // Continues: error-code is returned in MatVec
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Adds '2' to the parameter if the diagonal is unit
+  auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_TBMV define.
+  auto fast_kernels = false;
+  auto status = MatVec(layout, a_transpose,
+                       n, n, static_cast<T>(1),
+                       a_buffer, a_offset, a_ld,
+                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+                       x_buffer, x_offset, x_inc,
+                       fast_kernels, fast_kernels,
+                       parameter, false, k, 0);
+
+  // Returns the proper error code (renames vector Y to X)
+  switch(status) {
+    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
+    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
+    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+    default: return status;
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtbmv<float>;
+template class Xtbmv<double>;
+template class Xtbmv<float2>;
+template class Xtbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc
new file mode 100644
index 00000000..aa0e099b
--- /dev/null
+++ b/src/routines/level2/xtpmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtpmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtpmv<T>::Xtpmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n,
+                            const Buffer<T> &ap_buffer, const size_t ap_offset,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  // Creates a copy of X: a temporary scratch buffer
+  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+  try {
+    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  } catch (...) { } // Continues: error-code is returned in MatVec
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Adds '2' to the parameter if the diagonal is unit
+  auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_TPMV define.
+  auto fast_kernels = false;
+  auto status = MatVec(layout, a_transpose,
+                       n, n, static_cast<T>(1),
+                       ap_buffer, ap_offset, n,
+                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+                       x_buffer, x_offset, x_inc,
+                       fast_kernels, fast_kernels,
+                       parameter, true, 0, 0);
+
+  // Returns the proper error code (renames vector Y to X)
+  switch(status) {
+    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
+    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
+    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+    default: return status;
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtpmv<float>;
+template class Xtpmv<double>;
+template class Xtpmv<float2>;
+template class Xtpmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc
new file mode 100644
index 00000000..94424743
--- /dev/null
+++ b/src/routines/level2/xtrmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtrmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtrmv<T>::Xtrmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  // Creates a copy of X: a temporary scratch buffer
+  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+  try {
+    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  } catch (...) { } // Continues: error-code is returned in MatVec
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Adds '2' to the parameter if the diagonal is unit
+  auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific triangular matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_TRMV define.
+  auto fast_kernels = false;
+  auto status = MatVec(layout, a_transpose,
+                       n, n, static_cast<T>(1),
+                       a_buffer, a_offset, a_ld,
+                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+                       x_buffer, x_offset, x_inc,
+                       fast_kernels, fast_kernels,
+                       parameter, false, 0, 0);
+
+  // Returns the proper error code (renames vector Y to X)
+  switch(status) {
+    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
+    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
+    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+    default: return status;
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtrmv<float>;
+template class Xtrmv<double>;
+template class Xtrmv<float2>;
+template class Xtrmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 3d6fe595..6a066518 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -34,7 +34,7 @@ class TuneXgemv {
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/xgemv.opencl"
+      #include "../src/kernels/level2/xgemv.opencl"
     ;
   }
 
diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc
index 839ac3b1..85e18381 100644
--- a/test/correctness/testblas.cc
+++ b/test/correctness/testblas.cc
@@ -57,12 +57,14 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
   a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
   b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
   c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
+  ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
   dot_source_.resize(std::max(max_mat, max_matvec) + max_offset);
   PopulateVector(x_source_);
   PopulateVector(y_source_);
   PopulateVector(a_source_);
   PopulateVector(b_source_);
   PopulateVector(c_source_);
+  PopulateVector(ap_source_);
   PopulateVector(dot_source_);
 }
 
@@ -83,14 +85,16 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     auto a_mat1 = Buffer<T>(context_, args.a_size);
     auto b_mat1 = Buffer<T>(context_, args.b_size);
     auto c_mat1 = Buffer<T>(context_, args.c_size);
+    auto ap_mat1 = Buffer<T>(context_, args.ap_size);
     auto dot1 = Buffer<T>(context_, args.dot_size);
     x_vec1.Write(queue_, args.x_size, x_source_);
     y_vec1.Write(queue_, args.y_size, y_source_);
     a_mat1.Write(queue_, args.a_size, a_source_);
     b_mat1.Write(queue_, args.b_size, b_source_);
     c_mat1.Write(queue_, args.c_size, c_source_);
+    ap_mat1.Write(queue_, args.ap_size, ap_source_);
     dot1.Write(queue_, args.dot_size, dot_source_);
-    auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, dot1};
+    auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, dot1};
     auto status1 = run_reference_(args, buffers1, queue_);
 
     // Runs the CLBlast code
@@ -99,14 +103,16 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     auto a_mat2 = Buffer<T>(context_, args.a_size);
     auto b_mat2 = Buffer<T>(context_, args.b_size);
     auto c_mat2 = Buffer<T>(context_, args.c_size);
+    auto ap_mat2 = Buffer<T>(context_, args.ap_size);
     auto dot2 = Buffer<T>(context_, args.dot_size);
     x_vec2.Write(queue_, args.x_size, x_source_);
     y_vec2.Write(queue_, args.y_size, y_source_);
     a_mat2.Write(queue_, args.a_size, a_source_);
     b_mat2.Write(queue_, args.b_size, b_source_);
     c_mat2.Write(queue_, args.c_size, c_source_);
+    ap_mat2.Write(queue_, args.ap_size, ap_source_);
     dot2.Write(queue_, args.dot_size, dot_source_);
-    auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, dot2};
+    auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, dot2};
     auto status2 = run_routine_(args, buffers2, queue_);
 
     // Tests for equality of the two status codes
@@ -155,29 +161,33 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
     auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
     auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
     auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
+    auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
     auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.dot_size*sizeof(T), nullptr,nullptr);
     auto x_vec1 = Buffer<T>(x1);
     auto y_vec1 = Buffer<T>(y1);
     auto a_mat1 = Buffer<T>(a1);
     auto b_mat1 = Buffer<T>(b1);
     auto c_mat1 = Buffer<T>(c1);
+    auto ap_mat1 = Buffer<T>(ap1);
     auto dot1 = Buffer<T>(d1);
     auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
     auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
     auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
     auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
     auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
+    auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
     auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.dot_size*sizeof(T), nullptr,nullptr);
     auto x_vec2 = Buffer<T>(x2);
     auto y_vec2 = Buffer<T>(y2);
     auto a_mat2 = Buffer<T>(a2);
     auto b_mat2 = Buffer<T>(b2);
     auto c_mat2 = Buffer<T>(c2);
+    auto ap_mat2 = Buffer<T>(ap2);
     auto dot2 = Buffer<T>(d2);
 
     // Runs the two routines
-    auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, dot1};
-    auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, dot2};
+    auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, dot1};
+    auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, dot2};
     auto status1 = run_reference_(args, buffers1, queue_);
     auto status2 = run_routine_(args, buffers2, queue_);
 
diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h
index 9e1d110c..bfd1763c 100644
--- a/test/correctness/testblas.h
+++ b/test/correctness/testblas.h
@@ -49,6 +49,7 @@ class TestBlas: public Tester<T,U> {
   const std::vector<size_t> kIncrements = { 1, 2, 7 };
   const std::vector<size_t> kMatrixDims = { 7, 64 };
   const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
+  const std::vector<size_t> kBandSizes = { 4, 19 };
   const std::vector<size_t> kOffsets = GetOffsets();
   const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
   const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
@@ -90,6 +91,7 @@ class TestBlas: public Tester<T,U> {
   std::vector<T> a_source_;
   std::vector<T> b_source_;
   std::vector<T> c_source_;
+  std::vector<T> ap_source_;
   std::vector<T> dot_source_;
   
   // The routine-specific functions passed to the tester
@@ -121,6 +123,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
   auto ms = std::vector<size_t>{args.m};
   auto ns = std::vector<size_t>{args.n};
   auto ks = std::vector<size_t>{args.k};
+  auto kus = std::vector<size_t>{args.ku};
+  auto kls = std::vector<size_t>{args.kl};
   auto layouts = std::vector<Layout>{args.layout};
   auto a_transposes = std::vector<Transpose>{args.a_transpose};
   auto b_transposes = std::vector<Transpose>{args.b_transpose};
@@ -137,6 +141,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
   auto a_offsets = std::vector<size_t>{args.a_offset};
   auto b_offsets = std::vector<size_t>{args.b_offset};
   auto c_offsets = std::vector<size_t>{args.c_offset};
+  auto ap_offsets = std::vector<size_t>{args.ap_offset};
   auto dot_offsets = std::vector<size_t>{args.dot_offset};
   auto alphas = std::vector<U>{args.alpha};
   auto betas = std::vector<U>{args.beta};
@@ -145,6 +150,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
   auto a_sizes = std::vector<size_t>{args.a_size};
   auto b_sizes = std::vector<size_t>{args.b_size};
   auto c_sizes = std::vector<size_t>{args.c_size};
+  auto ap_sizes = std::vector<size_t>{args.ap_size};
 
   // Sets the dimensions of the matrices or vectors depending on the BLAS level
   auto dimensions = (C::BLASLevel() == 3) ? tester.kMatrixDims :
@@ -156,6 +162,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
     if (option == kArgM) { ms = dimensions; }
     if (option == kArgN) { ns = dimensions; }
     if (option == kArgK) { ks = dimensions; }
+    if (option == kArgKU) { kus = tester.kBandSizes; }
+    if (option == kArgKL) { kls = tester.kBandSizes; }
     if (option == kArgLayout) { layouts = tester.kLayouts; }
     if (option == kArgATransp) { a_transposes = C::GetATransposes(tester.kTransposes); }
     if (option == kArgBTransp) { b_transposes = C::GetBTransposes(tester.kTransposes); }
@@ -172,6 +180,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
     if (option == kArgAOffset) { a_offsets = tester.kOffsets; }
     if (option == kArgBOffset) { b_offsets = tester.kOffsets; }
     if (option == kArgCOffset) { c_offsets = tester.kOffsets; }
+    if (option == kArgAPOffset) { ap_offsets = tester.kOffsets; }
     if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; }
     if (option == kArgAlpha) { alphas = tester.kAlphaValues; }
     if (option == kArgBeta) { betas = tester.kBetaValues; }
@@ -181,6 +190,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
     if (option == kArgAOffset) { a_sizes = tester.kMatSizes; }
     if (option == kArgBOffset) { b_sizes = tester.kMatSizes; }
     if (option == kArgCOffset) { c_sizes = tester.kMatSizes; }
+    if (option == kArgAPOffset) { ap_sizes = tester.kMatSizes; }
   }
 
   // Loops over the test-cases from a data-layout point of view
@@ -197,21 +207,27 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
               for (auto &m: ms) { r_args.m = m;
                 for (auto &n: ns) { r_args.n = n;
                   for (auto &k: ks) { r_args.k = k;
-                    for (auto &x_inc: x_incs) { r_args.x_inc = x_inc;
-                      for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset;
-                        for (auto &y_inc: y_incs) { r_args.y_inc = y_inc;
-                          for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset;
-                            for (auto &a_ld: a_lds) { r_args.a_ld = a_ld;
-                              for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset;
-                                for (auto &b_ld: b_lds) { r_args.b_ld = b_ld;
-                                  for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset;
-                                    for (auto &c_ld: c_lds) { r_args.c_ld = c_ld;
-                                      for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset;
-                                        for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset;
-                                          for (auto &alpha: alphas) { r_args.alpha = alpha;
-                                            for (auto &beta: betas) { r_args.beta = beta;
-                                              C::SetSizes(r_args);
-                                              regular_test_vector.push_back(r_args);
+                    for (auto &ku: kus) { r_args.ku = ku;
+                      for (auto &kl: kls) { r_args.kl = kl;
+                        for (auto &x_inc: x_incs) { r_args.x_inc = x_inc;
+                          for (auto &x_offset: x_offsets) { r_args.x_offset = x_offset;
+                            for (auto &y_inc: y_incs) { r_args.y_inc = y_inc;
+                              for (auto &y_offset: y_offsets) { r_args.y_offset = y_offset;
+                                for (auto &a_ld: a_lds) { r_args.a_ld = a_ld;
+                                  for (auto &a_offset: a_offsets) { r_args.a_offset = a_offset;
+                                    for (auto &b_ld: b_lds) { r_args.b_ld = b_ld;
+                                      for (auto &b_offset: b_offsets) { r_args.b_offset = b_offset;
+                                        for (auto &c_ld: c_lds) { r_args.c_ld = c_ld;
+                                          for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset;
+                                            for (auto &ap_offset: ap_offsets) { r_args.ap_offset = ap_offset;
+                                              for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset;
+                                                for (auto &alpha: alphas) { r_args.alpha = alpha;
+                                                  for (auto &beta: betas) { r_args.beta = beta;
+                                                    C::SetSizes(r_args);
+                                                    regular_test_vector.push_back(r_args);
+                                                  }
+                                                }
+                                              }
                                             }
                                           }
                                         }
@@ -232,14 +248,16 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
               // Creates the arguments vector for the invalid-buffer tests
               auto invalid_test_vector = std::vector<Arguments<U>>{};
               auto i_args = args;
-              i_args.m = i_args.n = i_args.k = tester.kBufferSize;
+              i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
               i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
               for (auto &x_size: x_sizes) { i_args.x_size = x_size;
                 for (auto &y_size: y_sizes) { i_args.y_size = y_size;
                   for (auto &a_size: a_sizes) { i_args.a_size = a_size;
                     for (auto &b_size: b_sizes) { i_args.b_size = b_size;
                       for (auto &c_size: c_sizes) { i_args.c_size = c_size;
-                        invalid_test_vector.push_back(i_args);
+                        for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size;
+                          invalid_test_vector.push_back(i_args);
+                        }
                       }
                     }
                   }
diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc
index f792925e..350865f0 100644
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cc
@@ -132,6 +132,8 @@ void Tester<T,U>::TestEnd() {
       if (o == kArgM)        { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); }
       if (o == kArgN)        { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); }
       if (o == kArgK)        { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); }
+      if (o == kArgKU)       { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); }
+      if (o == kArgKL)       { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); }
       if (o == kArgLayout)   { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
       if (o == kArgATransp)  { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
       if (o == kArgBTransp)  { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
@@ -148,6 +150,7 @@ void Tester<T,U>::TestEnd() {
       if (o == kArgAOffset)  { fprintf(stdout, "%s=%lu ", kArgAOffset, entry.args.a_offset);}
       if (o == kArgBOffset)  { fprintf(stdout, "%s=%lu ", kArgBOffset, entry.args.b_offset);}
       if (o == kArgCOffset)  { fprintf(stdout, "%s=%lu ", kArgCOffset, entry.args.c_offset);}
+      if (o == kArgAPOffset) { fprintf(stdout, "%s=%lu ", kArgAPOffset, entry.args.ap_offset);}
       if (o == kArgDotOffset){ fprintf(stdout, "%s=%lu ", kArgDotOffset, entry.args.dot_offset);}
     }
     fprintf(stdout, "\n");
diff --git a/test/performance/client.cc b/test/performance/client.cc
index 9faa4dca..fb248854 100644
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@@ -48,9 +48,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
   for (auto &o: options_) {
 
     // Data-sizes
-    if (o == kArgM) { args.m  = GetArgument(argc, argv, help, kArgM, 512UL); }
-    if (o == kArgN) { args.n  = GetArgument(argc, argv, help, kArgN, 512UL); }
-    if (o == kArgK) { args.k  = GetArgument(argc, argv, help, kArgK, 512UL); }
+    if (o == kArgM)  { args.m   = GetArgument(argc, argv, help, kArgM, 512UL); }
+    if (o == kArgN)  { args.n   = GetArgument(argc, argv, help, kArgN, 512UL); }
+    if (o == kArgK)  { args.k   = GetArgument(argc, argv, help, kArgK, 512UL); }
+    if (o == kArgKU) { args.ku  = GetArgument(argc, argv, help, kArgKU, 128UL); }
+    if (o == kArgKL) { args.kl  = GetArgument(argc, argv, help, kArgKL, 128UL); }
 
     // Data-layouts
     if (o == kArgLayout)   { args.layout      = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@@ -73,6 +75,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
     if (o == kArgAOffset)  { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }
     if (o == kArgBOffset)  { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); }
     if (o == kArgCOffset)  { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
+    if (o == kArgAPOffset) { args.ap_offset= GetArgument(argc, argv, help, kArgAPOffset, size_t{0}); }
 
     // Dot arguments
     if (o == kArgDotOffset)  { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); }
@@ -131,12 +134,14 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
     std::vector<T> a_source(args.a_size);
     std::vector<T> b_source(args.b_size);
     std::vector<T> c_source(args.c_size);
+    std::vector<T> ap_source(args.ap_size);
     std::vector<T> dot_source(args.dot_size);
     PopulateVector(x_source);
     PopulateVector(y_source);
     PopulateVector(a_source);
     PopulateVector(b_source);
     PopulateVector(c_source);
+    PopulateVector(ap_source);
     PopulateVector(dot_source);
 
     // Creates the matrices on the device
@@ -145,14 +150,16 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
     auto a_mat = Buffer<T>(context, args.a_size);
     auto b_mat = Buffer<T>(context, args.b_size);
     auto c_mat = Buffer<T>(context, args.c_size);
+    auto ap_mat = Buffer<T>(context, args.ap_size);
     auto dot = Buffer<T>(context, args.dot_size);
     x_vec.Write(queue, args.x_size, x_source);
     y_vec.Write(queue, args.y_size, y_source);
     a_mat.Write(queue, args.a_size, a_source);
     b_mat.Write(queue, args.b_size, b_source);
     c_mat.Write(queue, args.c_size, c_source);
+    ap_mat.Write(queue, args.ap_size, ap_source);
     dot.Write(queue, args.dot_size, dot_source);
-    auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, dot};
+    auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, dot};
 
     // Runs the routines and collects the timings
     auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
@@ -225,8 +232,10 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas
   auto integers = std::vector<size_t>{};
   for (auto &o: options_) {
     if      (o == kArgM) {        integers.push_back(args.m); }
-    if      (o == kArgN) {        integers.push_back(args.n); }
+    else if (o == kArgN) {        integers.push_back(args.n); }
     else if (o == kArgK) {        integers.push_back(args.k); }
+    else if (o == kArgKU) {       integers.push_back(args.ku); }
+    else if (o == kArgKL) {       integers.push_back(args.kl); }
     else if (o == kArgLayout) {   integers.push_back(static_cast<size_t>(args.layout)); }
     else if (o == kArgSide) {     integers.push_back(static_cast<size_t>(args.side)); }
     else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
@@ -243,6 +252,7 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas
     else if (o == kArgAOffset) {  integers.push_back(args.a_offset); }
     else if (o == kArgBOffset) {  integers.push_back(args.b_offset); }
     else if (o == kArgCOffset) {  integers.push_back(args.c_offset); }
+    else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); }
     else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); }
   }
   auto strings = std::vector<std::string>{};
diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h
new file mode 100644
index 00000000..0e238804
--- /dev/null
+++ b/test/routines/level2/xgbmv.h
@@ -0,0 +1,140 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xgbmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XGBMV_H_
+#define CLBLAST_TEST_ROUTINES_XGBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXgbmv {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 2; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN, kArgKL, kArgKU,
+            kArgLayout, kArgATransp, 
+            kArgALeadDim, kArgXInc, kArgYInc,
+            kArgAOffset, kArgXOffset, kArgYOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &args) {
+    auto a_transposed = (args.a_transpose != Transpose::kNo);
+    auto n_real = (a_transposed) ? args.m : args.n;
+    return n_real * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    auto a_transposed = (args.a_transpose != Transpose::kNo);
+    auto m_real = (a_transposed) ? args.n : args.m;
+    return m_real * args.y_inc + args.y_offset;
+  }
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kRowMajor);
+    auto a_two = (a_rotated) ? args.m : args.n;
+    return a_two * args.a_ld + args.a_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Gbmv(args.layout, args.a_transpose,
+                       args.m, args.n, args.kl, args.ku, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              args.m, args.n, args.kl, args.ku, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                              buffers.y_vec(), args.y_offset, args.y_inc,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.Read(queue, args.y_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    auto a_transposed = (args.a_transpose != Transpose::kNo);
+    return (a_transposed) ? args.n : args.m;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.y_inc + args.y_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kRowMajor);
+    auto a_one = (a_rotated) ? args.n : args.m;
+    auto a_two = (a_rotated) ? args.m : args.n;
+    return ((args.kl+args.ku+1)*a_two + 2*a_one + a_two) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XGBMV_H_
+#endif
diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h
new file mode 100644
index 00000000..34e1502f
--- /dev/null
+++ b/test/routines/level2/xhbmv.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xhbmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XHBMV_H_
+#define CLBLAST_TEST_ROUTINES_XHBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXhbmv {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 2; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgKL,
+            kArgLayout, kArgTriangle,
+            kArgALeadDim, kArgXInc, kArgYInc,
+            kArgAOffset, kArgXOffset, kArgYOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    return args.n * args.y_inc + args.y_offset;
+  }
+  static size_t GetSizeA(const Arguments<T> &args) {
+    return args.n * args.a_ld + args.a_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Hbmv(args.layout, args.triangle,
+                       args.n, args.kl, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasUplo>(args.triangle),
+                              args.n, args.kl, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                              buffers.y_vec(), args.y_offset, args.y_inc,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.Read(queue, args.y_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.y_inc + args.y_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XHBMV_H_
+#endif
diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h
new file mode 100644
index 00000000..8fd85b62
--- /dev/null
+++ b/test/routines/level2/xhpmv.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xhpmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XHPMV_H_
+#define CLBLAST_TEST_ROUTINES_XHPMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXhpmv {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 2; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgLayout, kArgTriangle,
+            kArgXInc, kArgYInc,
+            kArgAPOffset, kArgXOffset, kArgYOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    return args.n * args.y_inc + args.y_offset;
+  }
+  static size_t GetSizeAP(const Arguments<T> &args) {
+    return ((args.n*(args.n+1)) / 2) + args.ap_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.ap_size = GetSizeAP(args);
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Hpmv(args.layout, args.triangle,
+                       args.n, args.alpha,
+                       buffers.ap_mat(), args.ap_offset,
+                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasUplo>(args.triangle),
+                              args.n, args.alpha,
+                              buffers.ap_mat(), args.ap_offset,
+                              buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                              buffers.y_vec(), args.y_offset, args.y_inc,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.Read(queue, args.y_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.y_inc + args.y_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XHPMV_H_
+#endif
diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h
new file mode 100644
index 00000000..5bc17e49
--- /dev/null
+++ b/test/routines/level2/xsbmv.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsbmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSBMV_H_
+#define CLBLAST_TEST_ROUTINES_XSBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXsbmv {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 2; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgKL,
+            kArgLayout, kArgTriangle,
+            kArgALeadDim, kArgXInc, kArgYInc,
+            kArgAOffset, kArgXOffset, kArgYOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    return args.n * args.y_inc + args.y_offset;
+  }
+  static size_t GetSizeA(const Arguments<T> &args) {
+    return args.n * args.a_ld + args.a_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Sbmv(args.layout, args.triangle,
+                       args.n, args.kl, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasUplo>(args.triangle),
+                              args.n, args.kl, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                              buffers.y_vec(), args.y_offset, args.y_inc,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.Read(queue, args.y_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.y_inc + args.y_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSBMV_H_
+#endif
diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h
new file mode 100644
index 00000000..e335da42
--- /dev/null
+++ b/test/routines/level2/xspmv.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xspmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSPMV_H_
+#define CLBLAST_TEST_ROUTINES_XSPMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXspmv {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 2; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgLayout, kArgTriangle,
+            kArgXInc, kArgYInc,
+            kArgAPOffset, kArgXOffset, kArgYOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    return args.n * args.y_inc + args.y_offset;
+  }
+  static size_t GetSizeAP(const Arguments<T> &args) {
+    return ((args.n*(args.n+1)) / 2) + args.ap_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.ap_size = GetSizeAP(args);
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Spmv(args.layout, args.triangle,
+                       args.n, args.alpha,
+                       buffers.ap_mat(), args.ap_offset,
+                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXspmv(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasUplo>(args.triangle),
+                              args.n, args.alpha,
+                              buffers.ap_mat(), args.ap_offset,
+                              buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                              buffers.y_vec(), args.y_offset, args.y_inc,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.Read(queue, args.y_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.y_inc + args.y_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSPMV_H_
+#endif
diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h
new file mode 100644
index 00000000..dbdddb65
--- /dev/null
+++ b/test/routines/level2/xtbmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtbmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTBMV_H_
+#define CLBLAST_TEST_ROUTINES_XTBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtbmv {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 2; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgKL,
+            kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+            kArgALeadDim, kArgXInc,
+            kArgAOffset, kArgXOffset};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeA(const Arguments<T> &args) {
+    return args.n * args.a_ld + args.a_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.x_size = GetSizeX(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+                          args.n, args.kl,
+                          buffers.a_mat(), args.a_offset, args.a_ld,
+                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
+                                 static_cast<clblasUplo>(args.triangle),
+                                 static_cast<clblasTranspose>(args.a_transpose),
+                                 static_cast<clblasDiag>(args.diagonal),
+                                 args.n, args.kl,
+                                 buffers.a_mat(), args.a_offset, args.a_ld,
+                                 buffers.x_vec(), args.x_offset, args.x_inc,
+                                 1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.x_size, static_cast<T>(0));
+    buffers.x_vec.Read(queue, args.x_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.x_inc + args.x_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTBMV_H_
+#endif
diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h
new file mode 100644
index 00000000..4425765e
--- /dev/null
+++ b/test/routines/level2/xtpmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtpmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTPMV_H_
+#define CLBLAST_TEST_ROUTINES_XTPMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtpmv {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 2; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+            kArgXInc,
+            kArgAPOffset, kArgXOffset};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeAP(const Arguments<T> &args) {
+    return ((args.n*(args.n+1)) / 2) + args.ap_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.ap_size = GetSizeAP(args);
+    args.x_size = GetSizeX(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+                          args.n,
+                          buffers.ap_mat(), args.ap_offset,
+                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
+                                 static_cast<clblasUplo>(args.triangle),
+                                 static_cast<clblasTranspose>(args.a_transpose),
+                                 static_cast<clblasDiag>(args.diagonal),
+                                 args.n,
+                                 buffers.ap_mat(), args.ap_offset,
+                                 buffers.x_vec(), args.x_offset, args.x_inc,
+                                 1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.x_size, static_cast<T>(0));
+    buffers.x_vec.Read(queue, args.x_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.x_inc + args.x_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTPMV_H_
+#endif
diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h
new file mode 100644
index 00000000..1c0c6fd8
--- /dev/null
+++ b/test/routines/level2/xtrmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtrmv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTRMV_H_
+#define CLBLAST_TEST_ROUTINES_XTRMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtrmv {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 2; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+            kArgALeadDim, kArgXInc,
+            kArgAOffset, kArgXOffset};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeA(const Arguments<T> &args) {
+    return args.n * args.a_ld + args.a_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.x_size = GetSizeX(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+                          args.n,
+                          buffers.a_mat(), args.a_offset, args.a_ld,
+                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
+                                 static_cast<clblasUplo>(args.triangle),
+                                 static_cast<clblasTranspose>(args.a_transpose),
+                                 static_cast<clblasDiag>(args.diagonal),
+                                 args.n,
+                                 buffers.a_mat(), args.a_offset, args.a_ld,
+                                 buffers.x_vec(), args.x_offset, args.x_inc,
+                                 1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.x_size, static_cast<T>(0));
+    buffers.x_vec.Read(queue, args.x_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.x_inc + args.x_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.n*args.n + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTRMV_H_
+#endif
diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h
index 10c7dd47..23a02a45 100644
--- a/test/wrapper_clblas.h
+++ b/test/wrapper_clblas.h
@@ -238,7 +238,7 @@ clblasStatus clblasXdot<float>(const size_t n,
                                cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float>(context, n);
+  auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
   return clblasSdot(n,
                     dot_buffer, dot_offset,
                     x_buffer, x_offset, static_cast<int>(x_inc),
@@ -255,7 +255,7 @@ clblasStatus clblasXdot<double>(const size_t n,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double>(context, n);
+  auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
   return clblasDdot(n,
                     dot_buffer, dot_offset,
                     x_buffer, x_offset, static_cast<int>(x_inc),
@@ -281,7 +281,7 @@ clblasStatus clblasXdotu<float2>(const size_t n,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float2>(context, n);
+  auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
   return clblasCdotu(n,
                      dot_buffer, dot_offset,
                      x_buffer, x_offset, static_cast<int>(x_inc),
@@ -298,7 +298,7 @@ clblasStatus clblasXdotu<double2>(const size_t n,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double2>(context, n);
+  auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
   return clblasZdotu(n,
                      dot_buffer, dot_offset,
                      x_buffer, x_offset, static_cast<int>(x_inc),
@@ -324,7 +324,7 @@ clblasStatus clblasXdotc<float2>(const size_t n,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float2>(context, n);
+  auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
   return clblasCdotc(n,
                      dot_buffer, dot_offset,
                      x_buffer, x_offset, static_cast<int>(x_inc),
@@ -341,7 +341,7 @@ clblasStatus clblasXdotc<double2>(const size_t n,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double2>(context, n);
+  auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
   return clblasZdotc(n,
                      dot_buffer, dot_offset,
                      x_buffer, x_offset, static_cast<int>(x_inc),
@@ -747,7 +747,7 @@ clblasStatus clblasXtrmv<float>(const clblasOrder layout, const clblasUplo trian
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float>(context, n);
+  auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
   return clblasStrmv(layout, triangle, a_transpose, diagonal,
                      n,
                      a_buffer, a_offset, a_ld,
@@ -764,7 +764,7 @@ clblasStatus clblasXtrmv<double>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double>(context, n);
+  auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
   return clblasDtrmv(layout, triangle, a_transpose, diagonal,
                      n,
                      a_buffer, a_offset, a_ld,
@@ -781,7 +781,7 @@ clblasStatus clblasXtrmv<float2>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float2>(context, n);
+  auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
   return clblasCtrmv(layout, triangle, a_transpose, diagonal,
                      n,
                      a_buffer, a_offset, a_ld,
@@ -798,7 +798,7 @@ clblasStatus clblasXtrmv<double2>(const clblasOrder layout, const clblasUplo tri
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double2>(context, n);
+  auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
   return clblasZtrmv(layout, triangle, a_transpose, diagonal,
                      n,
                      a_buffer, a_offset, a_ld,
@@ -824,7 +824,7 @@ clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo trian
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float>(context, n);
+  auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
   return clblasStbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
                      a_buffer, a_offset, a_ld,
@@ -841,7 +841,7 @@ clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double>(context, n);
+  auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
   return clblasDtbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
                      a_buffer, a_offset, a_ld,
@@ -858,7 +858,7 @@ clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float2>(context, n);
+  auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
   return clblasCtbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
                      a_buffer, a_offset, a_ld,
@@ -875,7 +875,7 @@ clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo tri
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double2>(context, n);
+  auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
   return clblasZtbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
                      a_buffer, a_offset, a_ld,
@@ -901,7 +901,7 @@ clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo trian
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float>(context, n);
+  auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
   return clblasStpmv(layout, triangle, a_transpose, diagonal,
                      n,
                      ap_buffer, ap_offset,
@@ -918,7 +918,7 @@ clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double>(context, n);
+  auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
   return clblasDtpmv(layout, triangle, a_transpose, diagonal,
                      n,
                      ap_buffer, ap_offset,
@@ -935,7 +935,7 @@ clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float2>(context, n);
+  auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
   return clblasCtpmv(layout, triangle, a_transpose, diagonal,
                      n,
                      ap_buffer, ap_offset,
@@ -952,7 +952,7 @@ clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo tri
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double2>(context, n);
+  auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
   return clblasZtpmv(layout, triangle, a_transpose, diagonal,
                      n,
                      ap_buffer, ap_offset,
author	Cedric Nugteren <web@cedricnugteren.nl>	2015-09-26 17:02:34 +0200
committer	Cedric Nugteren <web@cedricnugteren.nl>	2015-09-26 17:02:34 +0200
commit	92b4b0d1feaaf92e160fa0342daf4269f24fb4d2 (patch)
tree	3356b67a281d8292e893028d74a1801554ce0ef2
parent	42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff)
parent	2b56c2c60325f02bc695cbb968049cc09205c713 (diff)