44 files changed, 1137 insertions, 91 deletions
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cpp
index f90e26b2..ea4f001c 100644
--- a/src/routines/level2/xgbmv.cc
+++ b/src/routines/level2/xgbmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgbmv.h"
+#include "routines/level2/xgbmv.hpp"
 
 #include <string>
 #include <vector>
@@ -58,6 +58,7 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xgbmv<half>;
 template class Xgbmv<float>;
 template class Xgbmv<double>;
 template class Xgbmv<float2>;
diff --git a/src/routines/level2/xgbmv.hpp b/src/routines/level2/xgbmv.hpp
new file mode 100644
index 00000000..686ab642
--- /dev/null
+++ b/src/routines/level2/xgbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGBMV_H_
+#define CLBLAST_ROUTINES_XGBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgbmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n, const size_t kl, const size_t ku,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGBMV_H_
+#endif
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cpp
index f8985038..21fb397c 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
-template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
-template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
+    Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level2/xgemv.opencl"
     #include "../../kernels/level2/xgemv_fast.opencl"
@@ -100,12 +92,12 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
 
   // Tests the matrix and the vectors for validity
   auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
-  else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); }
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
+  else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
   if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
   // Determines whether or not the fast-version can be used
@@ -134,16 +126,22 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
     local_size = db_["WGS3"];
   }
 
+  // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context_, 1);
+  auto beta_buffer = Buffer<T>(context_, 1);
+  alpha_buffer.Write(queue_, 1, &alpha);
+  beta_buffer.Write(queue_, 1, &beta);
+
   // Retrieves the Xgemv kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, kernel_name);
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(m_real));
     kernel.SetArgument(1, static_cast<int>(n_real));
-    kernel.SetArgument(2, alpha);
-    kernel.SetArgument(3, beta);
+    kernel.SetArgument(2, alpha_buffer());
+    kernel.SetArgument(3, beta_buffer());
     kernel.SetArgument(4, static_cast<int>(a_rotated));
     kernel.SetArgument(5, a_buffer());
     kernel.SetArgument(6, static_cast<int>(a_offset));
@@ -162,7 +160,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
     // Launches the kernel
     auto global = std::vector<size_t>{global_size};
     auto local = std::vector<size_t>{local_size};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -173,6 +171,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xgemv<half>;
 template class Xgemv<float>;
 template class Xgemv<double>;
 template class Xgemv<float2>;
diff --git a/src/routines/level2/xgemv.hpp b/src/routines/level2/xgemv.hpp
new file mode 100644
index 00000000..e9afec8d
--- /dev/null
+++ b/src/routines/level2/xgemv.hpp
@@ -0,0 +1,56 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemv routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMV_H_
+#define CLBLAST_ROUTINES_XGEMV_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgemv: public Routine {
+ public:
+
+  // Constructor
+  Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+
+  // Generic version used also for other matrix-vector multiplications
+  StatusCode MatVec(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    bool fast_kernel, bool fast_kernel_rot,
+                    const size_t parameter, const bool packed,
+                    const size_t kl, const size_t ku);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMV_H_
+#endif
diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cpp
index 686c7e60..353047d2 100644
--- a/src/routines/level2/xger.cc
+++ b/src/routines/level2/xger.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xger.h"
+#include "routines/level2/xger.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xger<float>::precision_ = Precision::kSingle;
-template <> const Precision Xger<double>::precision_ = Precision::kDouble;
-template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xger<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xger"}, precision_) {
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level2/level2.opencl"
     #include "../../kernels/level2/xger.opencl"
@@ -57,22 +49,26 @@ StatusCode Xger<T>::DoGer(const Layout layout,
   const auto a_two = (a_is_rowmajor) ? m : n;
 
   // Tests the matrix and the vectors for validity
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorX(m, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(m, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
-  // Retrieves the Xgemv kernel from the compiled binary
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context_, 1);
+  alpha_buffer.Write(queue_, 1, &alpha);
+
+  // Retrieves the kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, "Xger");
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(a_one));
     kernel.SetArgument(1, static_cast<int>(a_two));
-    kernel.SetArgument(2, alpha);
+    kernel.SetArgument(2, alpha_buffer());
     kernel.SetArgument(3, x_buffer());
     kernel.SetArgument(4, static_cast<int>(x_offset));
     kernel.SetArgument(5, static_cast<int>(x_inc));
@@ -89,7 +85,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
     auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
     auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
     auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -100,6 +96,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xger<half>;
 template class Xger<float>;
 template class Xger<double>;
 template class Xger<float2>;
diff --git a/src/routines/level2/xger.hpp b/src/routines/level2/xger.hpp
new file mode 100644
index 00000000..3c6abe44
--- /dev/null
+++ b/src/routines/level2/xger.hpp
@@ -0,0 +1,43 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xger routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGER_H_
+#define CLBLAST_ROUTINES_XGER_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xger: public Routine {
+ public:
+
+  // Constructor
+  Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoGer(const Layout layout,
+                   const size_t m, const size_t n,
+                   const T alpha,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGER_H_
+#endif
diff --git a/src/routines/level2/xgerc.cc b/src/routines/level2/xgerc.cpp
index 73284b52..d9feda97 100644
--- a/src/routines/level2/xgerc.cc
+++ b/src/routines/level2/xgerc.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgerc.h"
+#include "routines/level2/xgerc.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xgerc.hpp b/src/routines/level2/xgerc.hpp
new file mode 100644
index 00000000..f1d04dfd
--- /dev/null
+++ b/src/routines/level2/xgerc.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgerc routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGERC_H_
+#define CLBLAST_ROUTINES_XGERC_H_
+
+#include "routines/level2/xger.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgerc: public Xger<T> {
+ public:
+
+  // Uses the regular Xger routine
+  using Xger<T>::DoGer;
+
+  // Constructor
+  Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoGerc(const Layout layout,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGERC_H_
+#endif
diff --git a/src/routines/level2/xgeru.cc b/src/routines/level2/xgeru.cpp
index 7730d6a5..da9e91c2 100644
--- a/src/routines/level2/xgeru.cc
+++ b/src/routines/level2/xgeru.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgeru.h"
+#include "routines/level2/xgeru.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xgeru.hpp b/src/routines/level2/xgeru.hpp
new file mode 100644
index 00000000..fb50e917
--- /dev/null
+++ b/src/routines/level2/xgeru.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgeru routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGERU_H_
+#define CLBLAST_ROUTINES_XGERU_H_
+
+#include "routines/level2/xger.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xgeru: public Xger<T> {
+ public:
+
+  // Uses the regular Xger routine
+  using Xger<T>::DoGer;
+
+  // Constructor
+  Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoGeru(const Layout layout,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGERU_H_
+#endif
diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cpp
index 58591b50..f6c0e3c4 100644
--- a/src/routines/level2/xhbmv.cc
+++ b/src/routines/level2/xhbmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhbmv.h"
+#include "routines/level2/xhbmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xhbmv.hpp b/src/routines/level2/xhbmv.hpp
new file mode 100644
index 00000000..d668eb88
--- /dev/null
+++ b/src/routines/level2/xhbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHBMV_H_
+#define CLBLAST_ROUTINES_XHBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xhbmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHbmv(const Layout layout, const Triangle triangle,
+                    const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHBMV_H_
+#endif
diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cpp
index b4ef0fa4..2cbcf7b4 100644
--- a/src/routines/level2/xhemv.cc
+++ b/src/routines/level2/xhemv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhemv.h"
+#include "routines/level2/xhemv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xhemv.hpp b/src/routines/level2/xhemv.hpp
new file mode 100644
index 00000000..8e062fd3
--- /dev/null
+++ b/src/routines/level2/xhemv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHEMV_H_
+#define CLBLAST_ROUTINES_XHEMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xhemv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHemv(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHEMV_H_
+#endif
diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cpp
index a7116213..ed8ba9e9 100644
--- a/src/routines/level2/xher.cc
+++ b/src/routines/level2/xher.cpp
@@ -11,25 +11,17 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"
 
 #include <string>
 
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
-template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
-template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xher<double2, double>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T, typename U>
 Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xger"}, precision_) {
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level2/level2.opencl"
     #include "../../kernels/level2/xher.opencl"
@@ -43,6 +35,7 @@ template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return floa
 template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; }
 template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; }
 template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; }
+template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }
 
 // =================================================================================================
 
@@ -63,28 +56,32 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
                          (triangle == Triangle::kLower && layout == Layout::kRowMajor));
   const auto is_rowmajor = (layout == Layout::kRowMajor);
 
-  // Creates a matching version of alpha
-  const auto matching_alpha = GetAlpha(alpha);
-
   // Tests the matrix and the vectors for validity
   auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
-  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
+  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
   if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
 
   // If alpha is zero an update is not required
   if (alpha == U{0}) { return StatusCode::kSuccess; }
 
-  // Retrieves the Xgemv kernel from the compiled binary
+  // Creates a matching version of alpha
+  const auto matching_alpha = GetAlpha(alpha);
+
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context_, 1);
+  alpha_buffer.Write(queue_, 1, &matching_alpha);
+
+  // Retrieves the kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, "Xher");
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, matching_alpha);
+    kernel.SetArgument(1, alpha_buffer());
     kernel.SetArgument(2, x_buffer());
     kernel.SetArgument(3, static_cast<int>(x_offset));
     kernel.SetArgument(4, static_cast<int>(x_inc));
@@ -99,7 +96,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
     auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
     auto global = std::vector<size_t>{global_one, global_two};
     auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -110,6 +107,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xher<half, half>;
 template class Xher<float, float>;
 template class Xher<double, double>;
 template class Xher<float2, float>;
diff --git a/src/routines/level2/xher.hpp b/src/routines/level2/xher.hpp
new file mode 100644
index 00000000..9ff6bf3f
--- /dev/null
+++ b/src/routines/level2/xher.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHER_H_
+#define CLBLAST_ROUTINES_XHER_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, typename U>
+class Xher: public Routine {
+ public:
+
+  // Constructor
+  Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
+
+  // Translates alpha of type 'U' into type 'T'
+  T GetAlpha(const U alpha);
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHer(const Layout layout, const Triangle triangle,
+                   const size_t n,
+                   const U alpha,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                   const bool packed = false);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHER_H_
+#endif
diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cpp
index 3fd1a961..50572cea 100644
--- a/src/routines/level2/xher2.cc
+++ b/src/routines/level2/xher2.cpp
@@ -11,25 +11,17 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"
 
 #include <string>
 
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
-template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
-template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xger"}, precision_) {
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level2/level2.opencl"
     #include "../../kernels/level2/xher2.opencl"
@@ -58,22 +50,26 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
 
   // Tests the matrix and the vectors for validity
   auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
-  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
+  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
   if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
-  // Retrieves the Xgemv kernel from the compiled binary
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context_, 1);
+  alpha_buffer.Write(queue_, 1, &alpha);
+
+  // Retrieves the kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, "Xher2");
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, alpha);
+    kernel.SetArgument(1, alpha_buffer());
     kernel.SetArgument(2, x_buffer());
     kernel.SetArgument(3, static_cast<int>(x_offset));
     kernel.SetArgument(4, static_cast<int>(x_inc));
@@ -91,7 +87,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
     auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
     auto global = std::vector<size_t>{global_one, global_two};
     auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -102,6 +98,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xher2<half>;
 template class Xher2<float>;
 template class Xher2<double>;
 template class Xher2<float2>;
diff --git a/src/routines/level2/xher2.hpp b/src/routines/level2/xher2.hpp
new file mode 100644
index 00000000..8c53c047
--- /dev/null
+++ b/src/routines/level2/xher2.hpp
@@ -0,0 +1,44 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHER2_H_
+#define CLBLAST_ROUTINES_XHER2_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xher2: public Routine {
+ public:
+
+  // Constructor
+  Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHer2(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const bool packed = false);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHER2_H_
+#endif
diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cpp
index 92686dbe..e6f82b34 100644
--- a/src/routines/level2/xhpmv.cc
+++ b/src/routines/level2/xhpmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhpmv.h"
+#include "routines/level2/xhpmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xhpmv.hpp b/src/routines/level2/xhpmv.hpp
new file mode 100644
index 00000000..b11192f9
--- /dev/null
+++ b/src/routines/level2/xhpmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPMV_H_
+#define CLBLAST_ROUTINES_XHPMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xhpmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHpmv(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPMV_H_
+#endif
diff --git a/src/routines/level2/xhpr.cc b/src/routines/level2/xhpr.cpp
index 4b31ad09..225ebfe5 100644
--- a/src/routines/level2/xhpr.cc
+++ b/src/routines/level2/xhpr.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhpr.h"
+#include "routines/level2/xhpr.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xhpr.hpp b/src/routines/level2/xhpr.hpp
new file mode 100644
index 00000000..37801c68
--- /dev/null
+++ b/src/routines/level2/xhpr.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpr routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPR_H_
+#define CLBLAST_ROUTINES_XHPR_H_
+
+#include "routines/level2/xher.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, typename U>
+class Xhpr: public Xher<T,U> {
+ public:
+
+  // Uses the regular Xher routine
+  using Xher<T,U>::DoHer;
+
+  // Constructor
+  Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHpr(const Layout layout, const Triangle triangle,
+                   const size_t n,
+                   const U alpha,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPR_H_
+#endif
diff --git a/src/routines/level2/xhpr2.cc b/src/routines/level2/xhpr2.cpp
index 9be24f43..85f9d3f9 100644
--- a/src/routines/level2/xhpr2.cc
+++ b/src/routines/level2/xhpr2.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhpr2.h"
+#include "routines/level2/xhpr2.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xhpr2.hpp b/src/routines/level2/xhpr2.hpp
new file mode 100644
index 00000000..d66dce55
--- /dev/null
+++ b/src/routines/level2/xhpr2.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpr2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPR2_H_
+#define CLBLAST_ROUTINES_XHPR2_H_
+
+#include "routines/level2/xher2.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xhpr2: public Xher2<T> {
+ public:
+
+  // Uses the regular Xher2 routine
+  using Xher2<T>::DoHer2;
+
+  // Constructor
+  Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoHpr2(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPR2_H_
+#endif
diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cpp
index bc82c88d..28730899 100644
--- a/src/routines/level2/xsbmv.cc
+++ b/src/routines/level2/xsbmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsbmv.h"
+#include "routines/level2/xsbmv.hpp"
 
 #include <string>
 #include <vector>
@@ -57,6 +57,7 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsbmv<half>;
 template class Xsbmv<float>;
 template class Xsbmv<double>;
 
diff --git a/src/routines/level2/xsbmv.hpp b/src/routines/level2/xsbmv.hpp
new file mode 100644
index 00000000..16c5e9a8
--- /dev/null
+++ b/src/routines/level2/xsbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSBMV_H_
+#define CLBLAST_ROUTINES_XSBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsbmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSbmv(const Layout layout, const Triangle triangle,
+                    const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSBMV_H_
+#endif
diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cpp
index 6e00dcfa..f6651012 100644
--- a/src/routines/level2/xspmv.cc
+++ b/src/routines/level2/xspmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xspmv.h"
+#include "routines/level2/xspmv.hpp"
 
 #include <string>
 #include <vector>
@@ -57,6 +57,7 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xspmv<half>;
 template class Xspmv<float>;
 template class Xspmv<double>;
 
diff --git a/src/routines/level2/xspmv.hpp b/src/routines/level2/xspmv.hpp
new file mode 100644
index 00000000..a0c69b85
--- /dev/null
+++ b/src/routines/level2/xspmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xspmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPMV_H_
+#define CLBLAST_ROUTINES_XSPMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xspmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSpmv(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPMV_H_
+#endif
diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cpp
index 55af2f29..a75fe9c3 100644
--- a/src/routines/level2/xspr.cc
+++ b/src/routines/level2/xspr.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xspr.h"
+#include "routines/level2/xspr.hpp"
 
 #include <string>
 
@@ -44,6 +44,7 @@ StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xspr<half>;
 template class Xspr<float>;
 template class Xspr<double>;
 
diff --git a/src/routines/level2/xspr.hpp b/src/routines/level2/xspr.hpp
new file mode 100644
index 00000000..6468c736
--- /dev/null
+++ b/src/routines/level2/xspr.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspr routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPR_H_
+#define CLBLAST_ROUTINES_XSPR_H_
+
+#include "routines/level2/xher.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xspr: public Xher<T,T> {
+ public:
+
+  // Uses the regular Xher routine
+  using Xher<T,T>::DoHer;
+
+  // Constructor
+  Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSpr(const Layout layout, const Triangle triangle,
+                   const size_t n,
+                   const T alpha,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPR_H_
+#endif
diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cpp
index 9a3f97ce..c39a2eb4 100644
--- a/src/routines/level2/xspr2.cc
+++ b/src/routines/level2/xspr2.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xspr2.h"
+#include "routines/level2/xspr2.hpp"
 
 #include <string>
 
@@ -46,6 +46,7 @@ StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xspr2<half>;
 template class Xspr2<float>;
 template class Xspr2<double>;
 
diff --git a/src/routines/level2/xspr2.hpp b/src/routines/level2/xspr2.hpp
new file mode 100644
index 00000000..693c56a1
--- /dev/null
+++ b/src/routines/level2/xspr2.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspr2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPR2_H_
+#define CLBLAST_ROUTINES_XSPR2_H_
+
+#include "routines/level2/xher2.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xspr2: public Xher2<T> {
+ public:
+
+  // Uses the regular Xher2 routine
+  using Xher2<T>::DoHer2;
+
+  // Constructor
+  Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSpr2(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPR2_H_
+#endif
diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cpp
index a9eb284f..648d2a3e 100644
--- a/src/routines/level2/xsymv.cc
+++ b/src/routines/level2/xsymv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsymv.h"
+#include "routines/level2/xsymv.hpp"
 
 #include <string>
 #include <vector>
@@ -57,6 +57,7 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsymv<half>;
 template class Xsymv<float>;
 template class Xsymv<double>;
 
diff --git a/src/routines/level2/xsymv.hpp b/src/routines/level2/xsymv.hpp
new file mode 100644
index 00000000..67815f2f
--- /dev/null
+++ b/src/routines/level2/xsymv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYMV_H_
+#define CLBLAST_ROUTINES_XSYMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsymv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSymv(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYMV_H_
+#endif
diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cpp
index 4b3928e5..758d8f8f 100644
--- a/src/routines/level2/xsyr.cc
+++ b/src/routines/level2/xsyr.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsyr.h"
+#include "routines/level2/xsyr.hpp"
 
 #include <string>
 
@@ -43,6 +43,7 @@ StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsyr<half>;
 template class Xsyr<float>;
 template class Xsyr<double>;
 
diff --git a/src/routines/level2/xsyr.hpp b/src/routines/level2/xsyr.hpp
new file mode 100644
index 00000000..20393454
--- /dev/null
+++ b/src/routines/level2/xsyr.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR_H_
+#define CLBLAST_ROUTINES_XSYR_H_
+
+#include "routines/level2/xher.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsyr: public Xher<T,T> {
+ public:
+
+  // Uses the regular Xher routine
+  using Xher<T,T>::DoHer;
+
+  // Constructor
+  Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSyr(const Layout layout, const Triangle triangle,
+                   const size_t n,
+                   const T alpha,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR_H_
+#endif
diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cpp
index 3ae389e0..6f43b219 100644
--- a/src/routines/level2/xsyr2.cc
+++ b/src/routines/level2/xsyr2.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsyr2.h"
+#include "routines/level2/xsyr2.hpp"
 
 #include <string>
 
@@ -45,6 +45,7 @@ StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsyr2<half>;
 template class Xsyr2<float>;
 template class Xsyr2<double>;
 
diff --git a/src/routines/level2/xsyr2.hpp b/src/routines/level2/xsyr2.hpp
new file mode 100644
index 00000000..1a8dcbe8
--- /dev/null
+++ b/src/routines/level2/xsyr2.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR2_H_
+#define CLBLAST_ROUTINES_XSYR2_H_
+
+#include "routines/level2/xher2.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xsyr2: public Xher2<T> {
+ public:
+
+  // Uses the regular Xher2 routine
+  using Xher2<T>::DoHer2;
+
+  // Constructor
+  Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoSyr2(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR2_H_
+#endif
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cpp
index 47371c87..e315c544 100644
--- a/src/routines/level2/xtbmv.cc
+++ b/src/routines/level2/xtbmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xtbmv.h"
+#include "routines/level2/xtbmv.hpp"
 
 #include <string>
 #include <vector>
@@ -72,6 +72,7 @@ StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xtbmv<half>;
 template class Xtbmv<float>;
 template class Xtbmv<double>;
 template class Xtbmv<float2>;
diff --git a/src/routines/level2/xtbmv.hpp b/src/routines/level2/xtbmv.hpp
new file mode 100644
index 00000000..389e9705
--- /dev/null
+++ b/src/routines/level2/xtbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTBMV_H_
+#define CLBLAST_ROUTINES_XTBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtbmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoTbmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n, const size_t k,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTBMV_H_
+#endif
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cpp
index c63cb9b2..46811089 100644
--- a/src/routines/level2/xtpmv.cc
+++ b/src/routines/level2/xtpmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xtpmv.h"
+#include "routines/level2/xtpmv.hpp"
 
 #include <string>
 #include <vector>
@@ -72,6 +72,7 @@ StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xtpmv<half>;
 template class Xtpmv<float>;
 template class Xtpmv<double>;
 template class Xtpmv<float2>;
diff --git a/src/routines/level2/xtpmv.hpp b/src/routines/level2/xtpmv.hpp
new file mode 100644
index 00000000..0e8cf1d2
--- /dev/null
+++ b/src/routines/level2/xtpmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTPMV_H_
+#define CLBLAST_ROUTINES_XTPMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtpmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoTpmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTPMV_H_
+#endif
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cpp
index 9111d41d..d2f24252 100644
--- a/src/routines/level2/xtrmv.cc
+++ b/src/routines/level2/xtrmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xtrmv.h"
+#include "routines/level2/xtrmv.hpp"
 
 #include <string>
 #include <vector>
@@ -72,6 +72,7 @@ StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xtrmv<half>;
 template class Xtrmv<float>;
 template class Xtrmv<double>;
 template class Xtrmv<float2>;
diff --git a/src/routines/level2/xtrmv.hpp b/src/routines/level2/xtrmv.hpp
new file mode 100644
index 00000000..07dd7841
--- /dev/null
+++ b/src/routines/level2/xtrmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMV_H_
+#define CLBLAST_ROUTINES_XTRMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtrmv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoTrmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMV_H_
+#endif