Merge pull request #27 from CNugteren/level2_matrix_vector

Added many level-2 matrix-vector routines
author: Cedric Nugteren <web@cedricnugteren.nl> 2015-09-26 17:02:34 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2015-09-26 17:02:34 +0200
commit: 92b4b0d1feaaf92e160fa0342daf4269f24fb4d2 (patch)
tree: 3356b67a281d8292e893028d74a1801554ce0ef2 /src
parent: 42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff)
parent: 2b56c2c60325f02bc695cbb968049cc09205c713 (diff)
15 files changed, 995 insertions, 198 deletions
diff --git a/src/clblast.cc b/src/clblast.cc
index a0dd8c70..77999aaf 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -28,8 +28,16 @@
 
 // BLAS level-2 includes
 #include "internal/routines/level2/xgemv.h"
+#include "internal/routines/level2/xgbmv.h"
 #include "internal/routines/level2/xhemv.h"
+#include "internal/routines/level2/xhbmv.h"
+#include "internal/routines/level2/xhpmv.h"
 #include "internal/routines/level2/xsymv.h"
+#include "internal/routines/level2/xsbmv.h"
+#include "internal/routines/level2/xspmv.h"
+#include "internal/routines/level2/xtrmv.h"
+#include "internal/routines/level2/xtbmv.h"
+#include "internal/routines/level2/xtpmv.h"
 
 // BLAS level-3 includes
 #include "internal/routines/level3/xgemm.h"
@@ -327,15 +335,26 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
 
 // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
 template <typename T>
-StatusCode Gbmv(const Layout, const Transpose,
-                const size_t, const size_t, const size_t, const size_t,
-                const T,
-                const cl_mem, const size_t, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
+                const size_t m, const size_t n, const size_t kl, const size_t ku,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xgbmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoGbmv(layout, a_transpose,
+                        m, n, kl, ku,
+                        alpha,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Gbmv<float>(const Layout, const Transpose,
                                 const size_t, const size_t, const size_t, const size_t,
@@ -412,15 +431,26 @@ template StatusCode Hemv<double2>(const Layout, const Triangle,
 
 // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
 template <typename T>
-StatusCode Hbmv(const Layout, const Triangle,
-                const size_t, const size_t,
-                const T,
-                const cl_mem, const size_t, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Hbmv(const Layout layout, const Triangle triangle,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xhbmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoHbmv(layout, triangle,
+                        n, k,
+                        alpha,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Hbmv<float2>(const Layout, const Triangle,
                                  const size_t, const size_t,
@@ -441,15 +471,26 @@ template StatusCode Hbmv<double2>(const Layout, const Triangle,
 
 // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
 template <typename T>
-StatusCode Hpmv(const Layout, const Triangle,
-                const size_t,
-                const T,
-                const cl_mem, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Hpmv(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem ap_buffer, const size_t ap_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xhpmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoHpmv(layout, triangle,
+                        n,
+                        alpha,
+                        Buffer<T>(ap_buffer), ap_offset,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Hpmv<float2>(const Layout, const Triangle,
                                  const size_t,
@@ -510,15 +551,26 @@ template StatusCode Symv<double>(const Layout, const Triangle,
 
 // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
 template <typename T>
-StatusCode Sbmv(const Layout, const Triangle,
-                const size_t, const size_t,
-                const T,
-                const cl_mem, const size_t, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Sbmv(const Layout layout, const Triangle triangle,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xsbmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoSbmv(layout, triangle,
+                        n, k,
+                        alpha,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Sbmv<float>(const Layout, const Triangle,
                                 const size_t, const size_t,
@@ -539,15 +591,26 @@ template StatusCode Sbmv<double>(const Layout, const Triangle,
 
 // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
 template <typename T>
-StatusCode Spmv(const Layout, const Triangle,
-                const size_t,
-                const T,
-                const cl_mem, const size_t,
-                const cl_mem, const size_t, const size_t,
-                const T,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Spmv(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem ap_buffer, const size_t ap_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xspmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoSpmv(layout, triangle,
+                        n,
+                        alpha,
+                        Buffer<T>(ap_buffer), ap_offset,
+                        Buffer<T>(x_buffer), x_offset, x_inc,
+                        beta,
+                        Buffer<T>(y_buffer), y_offset, y_inc);
 }
 template StatusCode Spmv<float>(const Layout, const Triangle,
                                 const size_t,
@@ -568,12 +631,20 @@ template StatusCode Spmv<double>(const Layout, const Triangle,
 
 // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
 template <typename T>
-StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
-                const size_t,
-                const cl_mem, const size_t, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtrmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoTrmv(layout, triangle, a_transpose, diagonal,
+                        n,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode Trmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
                                 const size_t,
@@ -598,12 +669,20 @@ template StatusCode Trmv<double2>(const Layout, const Triangle, const Transpose,
 
 // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
 template <typename T>
-StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal,
-                const size_t, const size_t,
-                const cl_mem, const size_t, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n, const size_t k,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtbmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoTbmv(layout, triangle, a_transpose, diagonal,
+                        n, k,
+                        Buffer<T>(a_buffer), a_offset, a_ld,
+                        Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode Tbmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
                                 const size_t, const size_t,
@@ -628,12 +707,20 @@ template StatusCode Tbmv<double2>(const Layout, const Triangle, const Transpose,
 
 // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
 template <typename T>
-StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal,
-                const size_t,
-                const cl_mem, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem ap_buffer, const size_t ap_offset,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto event_cpp = Event(*event);
+  auto routine = Xtpmv<T>(queue_cpp, event_cpp);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoTpmv(layout, triangle, a_transpose, diagonal,
+                        n,
+                        Buffer<T>(ap_buffer), ap_offset,
+                        Buffer<T>(x_buffer), x_offset, x_inc);
 }
 template StatusCode Tpmv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
                                 const size_t,
diff --git a/src/kernels/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 1e12dd78..8ed0e9e4 100644
--- a/src/kernels/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -79,22 +79,189 @@ R"(
 #endif
 
 // =================================================================================================
-// Defines how to load the input matrix in the regular case
 
-// Loads a scalar input value
+// Defines how to load the input matrix in the non-vectorized case
 inline real LoadMatrixA(const __global real* restrict agm, const int x, const int y,
-                        const int a_ld, const int a_offset) {
-  return agm[x + a_ld*y + a_offset];
+                        const int a_ld, const int a_offset, const int parameter,
+                        const int kl, const int ku) {
+  real result;
+
+  // For banded matrices
+  #if defined(ROUTINE_GBMV)
+    const int k = ku - y;
+    if (x >= y-ku && x < y+kl+1) { result = agm[a_ld*y + k + x + a_offset]; }
+    else { SetToZero(result); }
+
+  // For symmetric/hermitian matrices
+  #elif defined(ROUTINE_HEMV) || defined(ROUTINE_SYMV)
+    if ((parameter == 0 && y <= x) || (parameter == 1 && x <= y)) {
+      result = agm[a_ld*y + x + a_offset];
+      #if defined(ROUTINE_HEMV)
+        if (x == y) { result.y = ZERO; }
+      #endif
+    }
+    else {
+      result = agm[a_ld*x + y + a_offset];
+      #if defined(ROUTINE_HEMV)
+        COMPLEX_CONJUGATE(result);
+      #endif
+    }
+
+  // For triangular matrices
+  #elif defined(ROUTINE_TRMV)
+    if (((parameter == 0 || parameter == 2) && y <= x) ||
+        ((parameter == 1 || parameter == 3) && x <= y)) {
+      result = agm[a_ld*y + x + a_offset];
+      if (parameter >= 2 && y == x) {
+        SetToOne(result);
+      }
+    }
+    else {
+      SetToZero(result);
+    }
+
+  // For symmetric/hermitian banded matrices
+  #elif defined(ROUTINE_HBMV) || defined(ROUTINE_SBMV)
+    if (parameter == 1) {
+      if (x <= y) {
+        const int m = kl - y;
+        if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; }
+        else { SetToZero(result); }
+        #if defined(ROUTINE_HBMV)
+          if (x == y) { result.y = ZERO; }
+        #endif
+      }
+      else {
+        const int m = kl - x;
+        if (y >= x-kl && y <= x) { result = agm[a_ld*x + m + y + a_offset]; }
+        else { SetToZero(result); }
+        #if defined(ROUTINE_HBMV)
+          COMPLEX_CONJUGATE(result);
+        #endif
+      }
+    }
+    else {
+      if (x >= y) {
+        const int m = -y;
+        if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; }
+        else { SetToZero(result); }
+        #if defined(ROUTINE_HBMV)
+          if (x == y) { result.y = ZERO; }
+        #endif
+      }
+      else {
+        const int m = -x;
+        if (y >= x && y < x+kl+1) { result = agm[a_ld*x + m + y + a_offset]; }
+        else { SetToZero(result); }
+        #if defined(ROUTINE_HBMV)
+          COMPLEX_CONJUGATE(result);
+        #endif
+      }
+    }
+
+  // For triangular banded matrices
+  #elif defined(ROUTINE_TBMV)
+    if (parameter == 1 || parameter == 3) {
+      if (x <= y) {
+        const int m = kl - y;
+        if (x >= y-kl && x <= y) { result = agm[a_ld*y + m + x + a_offset]; }
+        else { SetToZero(result); }
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+    else {
+      if (x >= y) {
+        const int m = -y;
+        if (x >= y && x < y+kl+1) { result = agm[a_ld*y + m + x + a_offset]; }
+        else { SetToZero(result); }
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+
+  // For symmetric/hermitian packed matrices
+  #elif defined(ROUTINE_HPMV) || defined(ROUTINE_SPMV)
+    if (parameter == 1) {
+      if (x <= y) {
+        result = agm[((y+1)*y)/2 + x + a_offset];
+        #if defined(ROUTINE_HPMV)
+          if (x == y) { result.y = ZERO; }
+        #endif
+      }
+      else {
+        result = agm[((x+1)*x)/2 + y + a_offset];
+        #if defined(ROUTINE_HPMV)
+          COMPLEX_CONJUGATE(result);
+        #endif
+      }
+    }
+    else {
+      if (x >= y) {
+        result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset];
+        #if defined(ROUTINE_HPMV)
+          if (x == y) { result.y = ZERO; }
+        #endif
+      }
+      else {
+        result = agm[((2*a_ld-(x+1))*x)/2 + y + a_offset];
+        #if defined(ROUTINE_HPMV)
+          COMPLEX_CONJUGATE(result);
+        #endif
+      }
+    }
+
+  // For triangular packed matrices
+  #elif defined(ROUTINE_TPMV)
+    if (parameter == 1 || parameter == 3) {
+      if (x <= y) {
+        result = agm[((y+1)*y)/2 + x + a_offset];
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+    else {
+      if (x >= y) {
+        result = agm[((2*a_ld-(y+1))*y)/2 + x + a_offset];
+        if (parameter >= 2 && y == x) {
+          SetToOne(result);
+        }
+      }
+      else {
+        SetToZero(result);
+      }
+    }
+
+  // For general matrices
+  #else
+    result = agm[a_ld*y + x + a_offset];
+  #endif
+
+  return result;
 }
+
 // Loads a vector input value (1/2)
 inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
                             const int a_ld) {
-  return agm[x + a_ld*y];
+  return agm[a_ld*y + x];
 }
+
 // Loads a vector input value (2/2): as before, but different data-type
 inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
                               const int a_ld) {
-  return agm[x + a_ld*y];
+  return agm[a_ld*y + x];
 }
 
 // =================================================================================================
@@ -106,7 +273,8 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
                     const __global real* restrict agm, const int a_offset, const int a_ld,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc,
-                    const int do_conjugate) {
+                    const int do_conjugate, const int parameter,
+                    const int kl, const int ku) {
 
   // Local memory for the vector X
   __local real xlm[WGS1];
@@ -141,20 +309,20 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
         // The multiply-add function for the main part (divisable by WGS1)
         if (a_rotated == 0) { // Not rotated
           #pragma unroll
-          for (int kl=0; kl<WGS1; ++kl) {
-            const int k = kwg + kl;
-            real value = LoadMatrixA(agm, gid, k, a_ld, a_offset);
+          for (int kloop=0; kloop<WGS1; ++kloop) {
+            const int k = kwg + kloop;
+            real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
             if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-            MultiplyAdd(acc[w], xlm[kl], value);
+            MultiplyAdd(acc[w], xlm[kloop], value);
           }
         }
         else { // Transposed
           #pragma unroll
-          for (int kl=0; kl<WGS1; ++kl) {
-            const int k = kwg + kl;
-            real value = LoadMatrixA(agm, k, gid, a_ld, a_offset);
+          for (int kloop=0; kloop<WGS1; ++kloop) {
+            const int k = kwg + kloop;
+            real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
             if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-            MultiplyAdd(acc[w], xlm[kl], value);
+            MultiplyAdd(acc[w], xlm[kloop], value);
           }
         }
       }
@@ -174,7 +342,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
       if (a_rotated == 0) { // Not rotated
         #pragma unroll
         for (int k=n_floor; k<n; ++k) {
-          real value = LoadMatrixA(agm, gid, k, a_ld, a_offset);
+          real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
           if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
           MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value);
         }
@@ -182,7 +350,7 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
       else { // Transposed
         #pragma unroll
         for (int k=n_floor; k<n; ++k) {
-          real value = LoadMatrixA(agm, k, gid, a_ld, a_offset);
+          real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
           if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
           MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value);
         }
@@ -209,7 +377,8 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
                         const __global realVF* restrict agm, const int a_offset, const int a_ld,
                         const __global real* restrict xgm, const int x_offset, const int x_inc,
                         __global real* ygm, const int y_offset, const int y_inc,
-                        const int do_conjugate) {
+                        const int do_conjugate, const int parameter,
+                        const int kl, const int ku) {
   // Local memory for the vector X
   __local real xlm[WGS2];
 
@@ -305,7 +474,8 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
                            const __global realVFR* restrict agm, const int a_offset, const int a_ld,
                            const __global real* restrict xgm, const int x_offset, const int x_inc,
                            __global real* ygm, const int y_offset, const int y_inc,
-                           const int do_conjugate) {
+                           const int do_conjugate, const int parameter,
+                           const int kl, const int ku) {
   // Local memory for the vector X
   __local real xlm[WGS3];
 
diff --git a/src/routine.cc b/src/routine.cc
index 05a03683..2978c94a 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -191,6 +191,18 @@ StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buf
   return StatusCode::kSuccess;
 }
 
+// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size
+template <typename T>
+StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer,
+                                    const size_t offset, const size_t data_size) {
+  try {
+    auto required_size = (((n*(n+1))/2) + offset)*data_size;
+    auto buffer_size = buffer.GetSize();
+    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
+  } catch (...) { return StatusCode::kInvalidMatrixA; }
+  return StatusCode::kSuccess;
+}
+
 // =================================================================================================
 
 // Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc
new file mode 100644
index 00000000..14d391ca
--- /dev/null
+++ b/src/routines/level2/xgbmv.cc
@@ -0,0 +1,67 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xgbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n, const size_t kl, const size_t ku,
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // Reverses the upper and lower band count
+  auto rotated = (layout == Layout::kRowMajor);
+  auto kl_real = (rotated) ? ku : kl;
+  auto ku_real = (rotated) ? kl : ku;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_GBMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, a_transpose,
+                m, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                0, false, kl_real, ku_real);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgbmv<float>;
+template class Xgbmv<double>;
+template class Xgbmv<float2>;
+template class Xgbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index f95a9957..1b768dcd 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -32,8 +32,7 @@ template <typename T>
 Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
     Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
   source_string_ =
-    #include "../../kernels/pad.opencl" // For {Herm,Symm}{Upper,Lower}ToSquared (for HEMV/SYMV)
-    #include "../../kernels/xgemv.opencl"
+    #include "../../kernels/level2/xgemv.opencl"
   ;
 }
 
@@ -49,6 +48,31 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
                             const T beta,
                             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
 
+  // Performs the matrix-vector multiplication
+  return MatVec(layout, a_transpose,
+                m, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                true, true,
+                0, false, 0, 0); // N/A for this routine
+}
+
+// =================================================================================================
+
+// The generic implementation, also suited for other (non general) matrix-vector multiplications
+template <typename T>
+StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                            bool fast_kernel, bool fast_kernel_rot,
+                            const size_t parameter, const bool packed,
+                            const size_t kl, const size_t ku) {
+
   // Makes sure all dimensions are larger than zero
   if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
 
@@ -62,6 +86,11 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
   auto m_real = (a_transposed) ? n : m;
   auto n_real = (a_transposed) ? m : n;
 
+  // Special adjustments for banded matrices
+  if (kl != 0 || ku != 0) {
+    a_one = kl+ku+1;
+  }
+
   // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
   auto a_rotated = a_transposed ^ a_altlayout;
 
@@ -69,7 +98,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
   auto a_conjugate = (a_transpose == Transpose::kConjugate);
 
   // Tests the matrix and the vectors for validity
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = StatusCode::kSuccess;
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
+  else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); }
   if (ErrorIn(status)) { return status; }
   status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
   if (ErrorIn(status)) { return status; }
@@ -77,26 +108,26 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
   if (ErrorIn(status)) { return status; }
 
   // Determines whether or not the fast-version can be used
-  bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
-                         IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
-                         IsMultiple(n, db_["WGS2"]) &&
-                         IsMultiple(a_ld, db_["VW2"]);
-  bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
-                             IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
-                             IsMultiple(n, db_["WGS3"]) &&
-                             IsMultiple(a_ld, db_["VW3"]);
+  fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
+                IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
+                IsMultiple(n, db_["WGS2"]) &&
+                IsMultiple(a_ld, db_["VW2"]);
+  fast_kernel_rot = fast_kernel_rot && (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
+                    IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
+                    IsMultiple(n, db_["WGS3"]) &&
+                    IsMultiple(a_ld, db_["VW3"]);
 
   // If possible, run the fast-version (rotated or non-rotated) of the kernel
   auto kernel_name = "Xgemv";
   auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
   auto global_size = m_ceiled / db_["WPT1"];
   auto local_size = db_["WGS1"];
-  if (use_fast_kernel) {
+  if (fast_kernel) {
     kernel_name = "XgemvFast";
     global_size = m_real / db_["WPT2"];
     local_size = db_["WGS2"];
   }
-  if (use_fast_kernel_rot) {
+  if (fast_kernel_rot) {
     kernel_name = "XgemvFastRot";
     global_size = m_real / db_["WPT3"];
     local_size = db_["WGS3"];
@@ -123,6 +154,9 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
     kernel.SetArgument(12, static_cast<int>(y_offset));
     kernel.SetArgument(13, static_cast<int>(y_inc));
     kernel.SetArgument(14, static_cast<int>(a_conjugate));
+    kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
+    kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
+    kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
 
     // Launches the kernel
     auto global = std::vector<size_t>{global_size};
diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc
new file mode 100644
index 00000000..f59a7cb3
--- /dev/null
+++ b/src/routines/level2/xhbmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xhbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
+                            const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_HBMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, false, k, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhbmv<float2>;
+template class Xhbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc
index 2d92e45f..5a58b28b 100644
--- a/src/routines/level2/xhemv.cc
+++ b/src/routines/level2/xhemv.cc
@@ -37,57 +37,21 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
                             const T beta,
                             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
 
-  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
-
-  // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
-  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
-
-  // Temporary buffer for a copy of the hermitian matrix
-  try {
-    auto temp_herm = Buffer<T>(context_, n*n);
-
-    // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemv
-    // routine afterwards
-    try {
-      auto& program = GetProgramFromCache();
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the hermitian-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(n));
-      kernel.SetArgument(5, static_cast<int>(n));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_herm());
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // hermitian-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the regular Xgemv code
-      status = DoGemv(layout, Transpose::kNo, n, n, alpha,
-                      temp_herm, 0, n,
-                      x_buffer, x_offset, x_inc, beta,
-                      y_buffer, y_offset, y_inc);
-
-      // Return the status of the Xgemv routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_HEMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, false, 0, 0);
 }
 
 // =================================================================================================
diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc
new file mode 100644
index 00000000..2269255d
--- /dev/null
+++ b/src/routines/level2/xhpmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xhpmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhpmv<T>::Xhpmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
+                            const size_t n,
+                            const T alpha,
+                            const Buffer<T> &ap_buffer, const size_t ap_offset,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_HPMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                ap_buffer, ap_offset, n,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, true, 0, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhpmv<float2>;
+template class Xhpmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc
new file mode 100644
index 00000000..457bd762
--- /dev/null
+++ b/src/routines/level2/xsbmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xsbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xsbmv<T>::Xsbmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
+                            const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_SBMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, false, k, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xsbmv<float>;
+template class Xsbmv<double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc
new file mode 100644
index 00000000..4f1a9c61
--- /dev/null
+++ b/src/routines/level2/xspmv.cc
@@ -0,0 +1,64 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xspmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xspmv<T>::Xspmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
+                            const size_t n,
+                            const T alpha,
+                            const Buffer<T> &ap_buffer, const size_t ap_offset,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_SPMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                ap_buffer, ap_offset, n,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, true, 0, 0);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xspmv<float>;
+template class Xspmv<double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc
index 2ccb51f6..ec12324b 100644
--- a/src/routines/level2/xsymv.cc
+++ b/src/routines/level2/xsymv.cc
@@ -37,57 +37,21 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
                             const T beta,
                             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
 
-  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
-
-  // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines which kernel to run based on the layout (the Xgemv kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
-  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
-
-  // Temporary buffer for a copy of the symmetric matrix
-  try {
-    auto temp_symm = Buffer<T>(context_, n*n);
-
-    // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemv
-    // routine afterwards
-    try {
-      auto& program = GetProgramFromCache();
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the symmetric-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(n));
-      kernel.SetArgument(5, static_cast<int>(n));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_symm());
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // symmetric-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(n, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the regular Xgemv code
-      status = DoGemv(layout, Transpose::kNo, n, n, alpha,
-                      temp_symm, 0, n,
-                      x_buffer, x_offset, x_inc, beta,
-                      y_buffer, y_offset, y_inc);
-
-      // Return the status of the Xgemv routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific symmetric matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_SYMV define.
+  bool fast_kernels = false;
+  return MatVec(layout, Transpose::kNo,
+                n, n, alpha,
+                a_buffer, a_offset, a_ld,
+                x_buffer, x_offset, x_inc, beta,
+                y_buffer, y_offset, y_inc,
+                fast_kernels, fast_kernels,
+                is_upper, false, 0, 0);
 }
 
 // =================================================================================================
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc
new file mode 100644
index 00000000..2e1aebff
--- /dev/null
+++ b/src/routines/level2/xtbmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtbmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtbmv<T>::Xtbmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n, const size_t k,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  // Creates a copy of X: a temporary scratch buffer
+  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+  try {
+    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  } catch (...) { } // Continues: error-code is returned in MatVec
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Adds '2' to the parameter if the diagonal is unit
+  auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_TBMV define.
+  auto fast_kernels = false;
+  auto status = MatVec(layout, a_transpose,
+                       n, n, static_cast<T>(1),
+                       a_buffer, a_offset, a_ld,
+                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+                       x_buffer, x_offset, x_inc,
+                       fast_kernels, fast_kernels,
+                       parameter, false, k, 0);
+
+  // Returns the proper error code (renames vector Y to X)
+  switch(status) {
+    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
+    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
+    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+    default: return status;
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtbmv<float>;
+template class Xtbmv<double>;
+template class Xtbmv<float2>;
+template class Xtbmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc
new file mode 100644
index 00000000..aa0e099b
--- /dev/null
+++ b/src/routines/level2/xtpmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtpmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtpmv<T>::Xtpmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n,
+                            const Buffer<T> &ap_buffer, const size_t ap_offset,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  // Creates a copy of X: a temporary scratch buffer
+  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+  try {
+    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  } catch (...) { } // Continues: error-code is returned in MatVec
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Adds '2' to the parameter if the diagonal is unit
+  auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_TPMV define.
+  auto fast_kernels = false;
+  auto status = MatVec(layout, a_transpose,
+                       n, n, static_cast<T>(1),
+                       ap_buffer, ap_offset, n,
+                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+                       x_buffer, x_offset, x_inc,
+                       fast_kernels, fast_kernels,
+                       parameter, true, 0, 0);
+
+  // Returns the proper error code (renames vector Y to X)
+  switch(status) {
+    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
+    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
+    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+    default: return status;
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtpmv<float>;
+template class Xtpmv<double>;
+template class Xtpmv<float2>;
+template class Xtpmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc
new file mode 100644
index 00000000..94424743
--- /dev/null
+++ b/src/routines/level2/xtrmv.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xtrmv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtrmv<T>::Xtrmv(Queue &queue, Event &event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  // Creates a copy of X: a temporary scratch buffer
+  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+  try {
+    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  } catch (...) { } // Continues: error-code is returned in MatVec
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Adds '2' to the parameter if the diagonal is unit
+  auto parameter = (diagonal == Diagonal::kUnit) ? is_upper + 2 : is_upper;
+
+  // Runs the generic matrix-vector multiplication, disabling the use of fast vectorized kernels.
+  // The specific triangular matrix-accesses are implemented in the kernel guarded by the
+  // ROUTINE_TRMV define.
+  auto fast_kernels = false;
+  auto status = MatVec(layout, a_transpose,
+                       n, n, static_cast<T>(1),
+                       a_buffer, a_offset, a_ld,
+                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+                       x_buffer, x_offset, x_inc,
+                       fast_kernels, fast_kernels,
+                       parameter, false, 0, 0);
+
+  // Returns the proper error code (renames vector Y to X)
+  switch(status) {
+    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
+    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
+    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
+    default: return status;
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtrmv<float>;
+template class Xtrmv<double>;
+template class Xtrmv<float2>;
+template class Xtrmv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 3d6fe595..6a066518 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -34,7 +34,7 @@ class TuneXgemv {
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/xgemv.opencl"
+      #include "../src/kernels/level2/xgemv.opencl"
     ;
   }
author	Cedric Nugteren <web@cedricnugteren.nl>	2015-09-26 17:02:34 +0200
committer	Cedric Nugteren <web@cedricnugteren.nl>	2015-09-26 17:02:34 +0200
commit	92b4b0d1feaaf92e160fa0342daf4269f24fb4d2 (patch)
tree	3356b67a281d8292e893028d74a1801554ce0ef2 /src
parent	42db8ea968d9d2972446aa4fd73515a3d7aa093e (diff)
parent	2b56c2c60325f02bc695cbb968049cc09205c713 (diff)