16 files changed, 915 insertions, 47 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d4c9837c..c1981d7a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -14,6 +14,9 @@ Development version (next release)
   * CHPMV/ZHPMV
   * SSBMV/DSBMV
   * SSPMV/DSPMV
+  * STRMV/DTRMV/CTRMV/ZTRMV
+  * STBMV/DTBMV/CTBMV/ZTBMV
+  * STPMV/DTPMV/CTPMV/ZTPMV
 
 Version 0.4.0
 - Now using the Claduc C++11 interface to OpenCL
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e670664..1ddd2f77 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,7 +106,7 @@ set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm)
 set(SAMPLE_PROGRAMS_CPP sgemm)
 set(SAMPLE_PROGRAMS_C sgemm)
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
-set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv)
+set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
 set(PRECISIONS 32 3232 64 6464)
diff --git a/README.md b/README.md
index 7d62c92f..8c7870a2 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ CLBlast: The tuned OpenCL BLAS library
 
 CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
 
-__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support all routines yet: others will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
+__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
 
 
 Why CLBlast and not clBLAS or cuBLAS?
@@ -130,7 +130,7 @@ These graphs can be generated automatically on your own device. First, compile C
 Supported routines
 -------------
 
-CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
+CLBlast is in active development but already supports the majority of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
 
 | Level-1  | S | D | C | Z | Notes   |
 | ---------|---|---|---|---|---------|
@@ -160,9 +160,9 @@ CLBlast is in active development and currently does not support the full set of
 | xSYMV    | ✔ | ✔ | - | - |         |
 | xSBMV    | ✔ | ✔ | - | - |         |
 | xSPMV    | ✔ | ✔ | - | - |         |
-| xTRMV    |   |   |   |   |         |
-| xTBMV    |   |   |   |   |         |
-| xTPMV    |   |   |   |   |         |
+| xTRMV    | ✔ | ✔ | ✔ | ✔ |         |
+| xTBMV    | ✔ | ✔ | ✔ | ✔ |         |
+| xTPMV    | ✔ | ✔ | ✔ | ✔ |         |
 | xTRSV    |   |   |   |   |         |
 | xTBSV    |   |   |   |   |         |
 | xTPSV    |   |   |   |   |         |
diff --git a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h
new file mode 100644
index 00000000..89c90193
--- /dev/null
+++ b/include/internal/routines/level2/xtbmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTBMV_H_
+#define CLBLAST_ROUTINES_XTBMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtbmv: public Xgemv<T> {
+ public:
+  
+  // Members from the base class
+  using Routine<T>::queue_;
+  using Routine<T>::context_;
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoTbmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n, const size_t k,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTBMV_H_
+#endif
diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h
new file mode 100644
index 00000000..183d3505
--- /dev/null
+++ b/include/internal/routines/level2/xtpmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTPMV_H_
+#define CLBLAST_ROUTINES_XTPMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtpmv: public Xgemv<T> {
+ public:
+  
+  // Members from the base class
+  using Routine<T>::queue_;
+  using Routine<T>::context_;
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoTpmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTPMV_H_
+#endif
diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h
new file mode 100644
index 00000000..dadfbc98
--- /dev/null
+++ b/include/internal/routines/level2/xtrmv.h
@@ -0,0 +1,51 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMV_H_
+#define CLBLAST_ROUTINES_XTRMV_H_
+
+#include "internal/routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtrmv: public Xgemv<T> {
+ public:
+  
+  // Members from the base class
+  using Routine<T>::queue_;
+  using Routine<T>::context_;
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoTrmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMV_H_
+#endif
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 338c8468..25f02861 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -71,9 +71,9 @@ routines = [
   Routine(True,  "2a", "symv",  T,  [S,D],     ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"),
   Routine(True,  "2a", "sbmv",  T,  [S,D],     ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"),
   Routine(True,  "2a", "spmv",  T,  [S,D],     ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"),
-  Routine(False, "2a", "trmv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
-  Routine(False, "2a", "tbmv",  T,  [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
-  Routine(False, "2a", "tpmv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
+  Routine(True,  "2a", "trmv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
+  Routine(True,  "2a", "tbmv",  T,  [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
+  Routine(True,  "2a", "tpmv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
   Routine(False, "2a", "trsv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"),
   Routine(False, "2a", "tbsv",  T,  [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
   Routine(False, "2a", "tpsv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
@@ -213,7 +213,7 @@ def wrapper_clblas(routines):
 			if routine.scratch:
 				result += "  auto queue = Queue(queues[0]);\n"
 				result += "  auto context = queue.GetContext();\n"
-				result += "  auto scratch_buffer = Buffer<"+flavour.template+">(context, n);\n"
+				result += "  auto scratch_buffer = Buffer<"+flavour.template+">(context, n*x_inc + x_offset);\n"
 				arguments += ["scratch_buffer()"]
 			result += "  return clblas"+flavour.name+routine.name+"("
 			result += (",\n"+indent).join([a for a in arguments])
@@ -237,7 +237,7 @@ files = [
   path_clblast+"/src/clblast_c.cc",
   path_clblast+"/test/wrapper_clblas.h",
 ]
-header_lines = [84, 49, 80, 24, 22]
+header_lines = [84, 52, 80, 24, 22]
 footer_lines = [6, 3, 5, 2, 6]
 
 # Checks whether the command-line arguments are valid; exists otherwise
diff --git a/src/clblast.cc b/src/clblast.cc
index f7baf5e8..77999aaf 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -35,6 +35,9 @@
 #include "internal/routines/level2/xsymv.h"
 #include "internal/routines/level2/xsbmv.h"
 #include "internal/routines/level2/xspmv.h"
+#include "internal/routines/level2/xtrmv.h"
+#include "internal/routines/level2/xtbmv.h"
+#include "internal/routines/level2/xtpmv.h"
 
 // BLAS level-3 includes
 #include "internal/routines/level3/xgemm.h"
@@ -628,12 +631,20 @@ template StatusCode Spmv<double>(const Layout, const Triangle,
 
 // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
 template <typename T>
-StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
-                const size_t,
-                const cl_mem, const size_t, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtrmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoTrmv(layout, triangle, a_transpose, diagonal,
+                        n,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
                                 const size_t,
@@ -658,12 +669,20 @@ template StatusCode Trmv<double2>(const Layout, const Triangle, const Transpose,
 
 // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
 template <typename T>
-StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal,
-                const size_t, const size_t,
-                const cl_mem, const size_t, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n, const size_t k,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtbmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoTbmv(layout, triangle, a_transpose, diagonal,
+                        n, k,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
                                 const size_t, const size_t,
@@ -688,12 +707,20 @@ template StatusCode Tbmv<double2>(const Layout, const Triangle, const Transpose,
 
 // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
 template <typename T>
-StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal,
-                const size_t,
-                const cl_mem, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem ap_buffer, const size_t ap_offset,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtpmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoTpmv(layout, triangle, a_transpose, diagonal,
+                        n,
+                        Buffer<T>(ap_buffer), ap_offset,
+                        Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
                                 const size_t,
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index ab7802e5..8ed0e9e4 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -107,6 +107,19 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
       #endif
     }
 
+  // For triangular matrices
+  #elif defined(ROUTINE_TRMV)
+    if (((parameter == 0 || parameter == 2) && y <= x) ||
+        ((parameter == 1 || parameter == 3) && x <= y)) {
+      result = agm[a_ld*y + x + a_offset];
+      if (parameter >= 2 && y == x) {
+        SetToOne(result);
+      }
+    }
+    else {
+      SetToZero(result);
+    }
+
   // For symmetric/hermitian banded matrices
   #elif defined(ROUTINE_HBMV) || defined(ROUTINE_SBMV)
     if (parameter == 1) {
@@ -146,6 +159,35 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
       }
     }
 
+  // For triangular banded matrices
+  #elif defined(ROUTINE_TBMV)
+    if (parameter == 1 || parameter == 3) {
+      if (x <= y) {
+        const int m = kl - y;
+        if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; }
+        else { SetToZero(result); }
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+    else {
+      if (x >= y) {
+        const int m = -y;
+        if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; }
+        else { SetToZero(result); }
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+
   // For symmetric/hermitian packed matrices
   #elif defined(ROUTINE_HPMV) || defined(ROUTINE_SPMV)
     if (parameter == 1) {
@@ -177,6 +219,31 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
       }
     }
 
+  // For triangular packed matrices
+  #elif defined(ROUTINE_TPMV)
+    if (parameter == 1 || parameter == 3) {
+      if (x <= y) {
+        result = agm[((y+1)*y)/2 + x + a_offset];
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+    else {
+      if (x >= y) {
+        result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset];
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+
   // For general matrices
   #else
     result = agm[a_ld*y + x + a_offset];
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc
new file mode 100644
index 00000000..2e1aebff
--- /dev/null
+++ b/src/routines/level2/xtbmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtbmv<T>::Xtbmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n, const size_t k,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  // Creates a copy of X: a temporary scratch buffer
+  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+  try {
+    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  } catch (...) { } // Continues: error-code is returned in MatVec
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Adds '2' to the parameter if the diagonal is unit
+  auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_TBMV define.
+  auto fast_kernels = false;
+  auto status = MatVec(layout, a_transpose,
+                       n, n, static_cast<T>(1),
+                       a_buffer, a_offset, a_ld,
+                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+                       x_buffer, x_offset, x_inc,
+                       fast_kernels, fast_kernels,
+                       parameter, false, k, 0);
+
+  // Returns the proper error code (renames vector Y to X)
+  switch(status) {
+    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
+    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
+    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+    default: return status;
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtbmv<float>;
+template class Xtbmv<double>;
+template class Xtbmv<float2>;
+template class Xtbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc
new file mode 100644
index 00000000..aa0e099b
--- /dev/null
+++ b/src/routines/level2/xtpmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtpmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtpmv<T>::Xtpmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n,
+                            const Buffer<T> &ap_buffer, const size_t ap_offset,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  // Creates a copy of X: a temporary scratch buffer
+  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+  try {
+    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  } catch (...) { } // Continues: error-code is returned in MatVec
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Adds '2' to the parameter if the diagonal is unit
+  auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_TPMV define.
+  auto fast_kernels = false;
+  auto status = MatVec(layout, a_transpose,
+                       n, n, static_cast<T>(1),
+                       ap_buffer, ap_offset, n,
+                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+                       x_buffer, x_offset, x_inc,
+                       fast_kernels, fast_kernels,
+                       parameter, true, 0, 0);
+
+  // Returns the proper error code (renames vector Y to X)
+  switch(status) {
+    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
+    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
+    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+    default: return status;
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtpmv<float>;
+template class Xtpmv<double>;
+template class Xtpmv<float2>;
+template class Xtpmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc
new file mode 100644
index 00000000..94424743
--- /dev/null
+++ b/src/routines/level2/xtrmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtrmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtrmv<T>::Xtrmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  // Creates a copy of X: a temporary scratch buffer
+  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+  try {
+    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  } catch (...) { } // Continues: error-code is returned in MatVec
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Adds '2' to the parameter if the diagonal is unit
+  auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific triangular matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_TRMV define.
+  auto fast_kernels = false;
+  auto status = MatVec(layout, a_transpose,
+                       n, n, static_cast<T>(1),
+                       a_buffer, a_offset, a_ld,
+                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+                       x_buffer, x_offset, x_inc,
+                       fast_kernels, fast_kernels,
+                       parameter, false, 0, 0);
+
+  // Returns the proper error code (renames vector Y to X)
+  switch(status) {
+    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
+    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
+    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+    default: return status;
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtrmv<float>;
+template class Xtrmv<double>;
+template class Xtrmv<float2>;
+template class Xtrmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h
new file mode 100644
index 00000000..dbdddb65
--- /dev/null
+++ b/test/routines/level2/xtbmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtbmv routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTBMV_H_
+#define CLBLAST_TEST_ROUTINES_XTBMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static description of the Xtbmv routine for the correctness and performance testers
+template <typename T>
+class TestXtbmv {
+ public:
+
+  // The BLAS level of this routine (out of 1, 2, or 3)
+  static size_t BLASLevel() { return 2; }
+
+  // The names of the arguments that are relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgKL,
+            kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+            kArgALeadDim, kArgXInc,
+            kArgAOffset, kArgXOffset};
+  }
+
+  // Buffer sizes in elements: n*x_inc + x_offset for X and n*a_ld + a_offset for A
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeA(const Arguments<T> &args) {
+    return args.n * args.a_ld + args.a_offset;
+  }
+
+  // Stores the sizes of all the buffers in the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.x_size = GetSizeX(args);
+  }
+
+  // Default leading dimensions: n for A; B and C do not exist for this routine
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Transpose options relevant for this routine: all of them for A, none for B
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Runs the CLBlast routine; waits on and releases the event only when the routine succeeded
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+                          args.n, args.kl,
+                          buffers.a_mat(), args.a_offset, args.a_ld,
+                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          &queue_plain, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Runs the clBLAS reference; waits on and releases the event only when the call succeeded
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
+                                 static_cast<clblasUplo>(args.triangle),
+                                 static_cast<clblasTranspose>(args.a_transpose),
+                                 static_cast<clblasDiag>(args.diagonal),
+                                 args.n, args.kl,
+                                 buffers.a_mat(), args.a_offset, args.a_ld,
+                                 buffers.x_vec(), args.x_offset, args.x_inc,
+                                 1, &queue_plain, 0, nullptr, &event);
+    if (status == clblasSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return static_cast<StatusCode>(status);
+  }
+
+  // Downloads the X vector, which is the buffer holding the result of the computation
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.x_size, static_cast<T>(0));
+    buffers.x_vec.Read(queue, args.x_size, result);
+    return result;
+  }
+
+  // Result indices: ResultID1 is n; index id1 maps to position id1*x_inc + x_offset in X
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.x_inc + args.x_offset;
+  }
+
+  // Performance metrics. NOTE(review): GetBytes counts 2*kl+1 bands of length n, confirm intended
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return ((args.kl+args.kl+1)*args.n + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTBMV_H_
+#endif
diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h
new file mode 100644
index 00000000..4425765e
--- /dev/null
+++ b/test/routines/level2/xtpmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtpmv routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTPMV_H_
+#define CLBLAST_TEST_ROUTINES_XTPMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static description of the Xtpmv routine for the correctness and performance testers
+template <typename T>
+class TestXtpmv {
+ public:
+
+  // The BLAS level of this routine (out of 1, 2, or 3)
+  static size_t BLASLevel() { return 2; }
+
+  // The names of the arguments that are relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+            kArgXInc,
+            kArgAPOffset, kArgXOffset};
+  }
+
+  // Buffer sizes in elements: n*x_inc + x_offset for X and n*(n+1)/2 + ap_offset for AP
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeAP(const Arguments<T> &args) {
+    return ((args.n*(args.n+1)) / 2) + args.ap_offset;
+  }
+
+  // Stores the sizes of all the buffers in the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    args.ap_size = GetSizeAP(args);
+    args.x_size = GetSizeX(args);
+  }
+
+  // Default leading dimensions (the routine call below takes no leading dimension argument)
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Transpose options relevant for this routine: all of them for A, none for B
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Runs the CLBlast routine; waits on and releases the event only when the routine succeeded
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+                          args.n,
+                          buffers.ap_mat(), args.ap_offset,
+                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          &queue_plain, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Runs the clBLAS reference; waits on and releases the event only when the call succeeded
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
+                                 static_cast<clblasUplo>(args.triangle),
+                                 static_cast<clblasTranspose>(args.a_transpose),
+                                 static_cast<clblasDiag>(args.diagonal),
+                                 args.n,
+                                 buffers.ap_mat(), args.ap_offset,
+                                 buffers.x_vec(), args.x_offset, args.x_inc,
+                                 1, &queue_plain, 0, nullptr, &event);
+    if (status == clblasSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return static_cast<StatusCode>(status);
+  }
+
+  // Downloads the X vector, which is the buffer holding the result of the computation
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.x_size, static_cast<T>(0));
+    buffers.x_vec.Read(queue, args.x_size, result);
+    return result;
+  }
+
+  // Result indices: ResultID1 is n; index id1 maps to position id1*x_inc + x_offset in X
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.x_inc + args.x_offset;
+  }
+
+  // Performance metrics: 2*n*n flops; bytes for n*(n+1)/2 packed elements plus 3*n vector ones
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (((args.n*(args.n+1)) / 2) + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTPMV_H_
+#endif
diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h
new file mode 100644
index 00000000..1c0c6fd8
--- /dev/null
+++ b/test/routines/level2/xtrmv.h
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtrmv routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTRMV_H_
+#define CLBLAST_TEST_ROUTINES_XTRMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Static description of the Xtrmv routine for the correctness and performance testers
+template <typename T>
+class TestXtrmv {
+ public:
+
+  // The BLAS level of this routine (out of 1, 2, or 3)
+  static size_t BLASLevel() { return 2; }
+
+  // The names of the arguments that are relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+            kArgALeadDim, kArgXInc,
+            kArgAOffset, kArgXOffset};
+  }
+
+  // Buffer sizes in elements: n*x_inc + x_offset for X and n*a_ld + a_offset for A
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeA(const Arguments<T> &args) {
+    return args.n * args.a_ld + args.a_offset;
+  }
+
+  // Stores the sizes of all the buffers in the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.x_size = GetSizeX(args);
+  }
+
+  // Default leading dimensions: n for A; B and C do not exist for this routine
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Transpose options relevant for this routine: all of them for A, none for B
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Runs the CLBlast routine; waits on and releases the event only when the routine succeeded
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+                          args.n,
+                          buffers.a_mat(), args.a_offset, args.a_ld,
+                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          &queue_plain, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Runs the clBLAS reference; waits on and releases the event only when the call succeeded
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
+                                 static_cast<clblasUplo>(args.triangle),
+                                 static_cast<clblasTranspose>(args.a_transpose),
+                                 static_cast<clblasDiag>(args.diagonal),
+                                 args.n,
+                                 buffers.a_mat(), args.a_offset, args.a_ld,
+                                 buffers.x_vec(), args.x_offset, args.x_inc,
+                                 1, &queue_plain, 0, nullptr, &event);
+    if (status == clblasSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return static_cast<StatusCode>(status);
+  }
+
+  // Downloads the X vector, which is the buffer holding the result of the computation
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.x_size, static_cast<T>(0));
+    buffers.x_vec.Read(queue, args.x_size, result);
+    return result;
+  }
+
+  // Result indices: ResultID1 is n; index id1 maps to position id1*x_inc + x_offset in X
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.x_inc + args.x_offset;
+  }
+
+  // Performance metrics: 2*n*n flops; bytes for n*n matrix elements plus 3*n vector elements
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.n*args.n + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTRMV_H_
+#endif
diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h
index 10c7dd47..23a02a45 100644
--- a/test/wrapper_clblas.h
+++ b/test/wrapper_clblas.h
@@ -238,7 +238,7 @@ clblasStatus clblasXdot<float>(const size_t n,
                                cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float>(context, n);
+  auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
   return clblasSdot(n,
                     dot_buffer, dot_offset,
                     x_buffer, x_offset, static_cast<int>(x_inc),
@@ -255,7 +255,7 @@ clblasStatus clblasXdot<double>(const size_t n,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double>(context, n);
+  auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
   return clblasDdot(n,
                     dot_buffer, dot_offset,
                     x_buffer, x_offset, static_cast<int>(x_inc),
@@ -281,7 +281,7 @@ clblasStatus clblasXdotu<float2>(const size_t n,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float2>(context, n);
+  auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
   return clblasCdotu(n,
                      dot_buffer, dot_offset,
                      x_buffer, x_offset, static_cast<int>(x_inc),
@@ -298,7 +298,7 @@ clblasStatus clblasXdotu<double2>(const size_t n,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double2>(context, n);
+  auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
   return clblasZdotu(n,
                      dot_buffer, dot_offset,
                      x_buffer, x_offset, static_cast<int>(x_inc),
@@ -324,7 +324,7 @@ clblasStatus clblasXdotc<float2>(const size_t n,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float2>(context, n);
+  auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
   return clblasCdotc(n,
                      dot_buffer, dot_offset,
                      x_buffer, x_offset, static_cast<int>(x_inc),
@@ -341,7 +341,7 @@ clblasStatus clblasXdotc<double2>(const size_t n,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double2>(context, n);
+  auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
   return clblasZdotc(n,
                      dot_buffer, dot_offset,
                      x_buffer, x_offset, static_cast<int>(x_inc),
@@ -747,7 +747,7 @@ clblasStatus clblasXtrmv<float>(const clblasOrder layout, const clblasUplo trian
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float>(context, n);
+  auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
   return clblasStrmv(layout, triangle, a_transpose, diagonal,
                      n,
                      a_buffer, a_offset, a_ld,
@@ -764,7 +764,7 @@ clblasStatus clblasXtrmv<double>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double>(context, n);
+  auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
   return clblasDtrmv(layout, triangle, a_transpose, diagonal,
                      n,
                      a_buffer, a_offset, a_ld,
@@ -781,7 +781,7 @@ clblasStatus clblasXtrmv<float2>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float2>(context, n);
+  auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
   return clblasCtrmv(layout, triangle, a_transpose, diagonal,
                      n,
                      a_buffer, a_offset, a_ld,
@@ -798,7 +798,7 @@ clblasStatus clblasXtrmv<double2>(const clblasOrder layout, const clblasUplo tri
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double2>(context, n);
+  auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
   return clblasZtrmv(layout, triangle, a_transpose, diagonal,
                      n,
                      a_buffer, a_offset, a_ld,
@@ -824,7 +824,7 @@ clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo trian
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float>(context, n);
+  auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
   return clblasStbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
                      a_buffer, a_offset, a_ld,
@@ -841,7 +841,7 @@ clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double>(context, n);
+  auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
   return clblasDtbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
                      a_buffer, a_offset, a_ld,
@@ -858,7 +858,7 @@ clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float2>(context, n);
+  auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
   return clblasCtbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
                      a_buffer, a_offset, a_ld,
@@ -875,7 +875,7 @@ clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo tri
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double2>(context, n);
+  auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
   return clblasZtbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
                      a_buffer, a_offset, a_ld,
@@ -901,7 +901,7 @@ clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo trian
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float>(context, n);
+  auto scratch_buffer = Buffer<float>(context, n*x_inc + x_offset);
   return clblasStpmv(layout, triangle, a_transpose, diagonal,
                      n,
                      ap_buffer, ap_offset,
@@ -918,7 +918,7 @@ clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double>(context, n);
+  auto scratch_buffer = Buffer<double>(context, n*x_inc + x_offset);
   return clblasDtpmv(layout, triangle, a_transpose, diagonal,
                      n,
                      ap_buffer, ap_offset,
@@ -935,7 +935,7 @@ clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo tria
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<float2>(context, n);
+  auto scratch_buffer = Buffer<float2>(context, n*x_inc + x_offset);
   return clblasCtpmv(layout, triangle, a_transpose, diagonal,
                      n,
                      ap_buffer, ap_offset,
@@ -952,7 +952,7 @@ clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo tri
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
-  auto scratch_buffer = Buffer<double2>(context, n);
+  auto scratch_buffer = Buffer<double2>(context, n*x_inc + x_offset);
   return clblasZtpmv(layout, triangle, a_transpose, diagonal,
                      n,
                      ap_buffer, ap_offset,