87 files changed, 2252 insertions, 42 deletions
diff --git a/src/routines/common.cc b/src/routines/common.cc
index 561a1bd8..c378df28 100644
--- a/src/routines/common.cc
+++ b/src/routines/common.cc
@@ -13,7 +13,7 @@
 
 #include <vector>
 
-#include "internal/routines/common.h"
+#include "routines/common.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
new file mode 100644
index 00000000..c99cd39d
--- /dev/null
+++ b/src/routines/common.hpp
@@ -0,0 +1,173 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the interfaces to common kernels, such as copying, padding, and
+// transposing a matrix. These functions are templated and thus header-only. This file also contains
+// other common functions to routines, such as a function to launch a kernel.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_COMMON_H_
+#define CLBLAST_ROUTINES_COMMON_H_
+
+#include <string>
+#include <vector>
+
+#include "clblast.h"
+#include "clpp11.hpp"
+#include "database/database.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Enqueues a kernel, waits for completion, and checks for errors. 'global' is passed by value.
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+                     std::vector<size_t> global, const std::vector<size_t> &local,
+                     EventPointer event, std::vector<Event>& waitForEvents);
+
+// As above, but without an event waiting list (the 'waitForEvents' argument is omitted)
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+                     std::vector<size_t> global, const std::vector<size_t> &local,
+                     EventPointer event);
+
+// =================================================================================================
+
+// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
+// to write to symmetric and triangular matrices through optional arguments. The "fast" kernels
+// are only used when the offsets are zero, the source and destination sizes and leading dimensions
+// match, no conjugation/triangular/diagonal handling is requested, and the size checks below pass.
+// Returns the status of RunKernel, or kInvalidKernel if anything inside the try-block throws.
+template <typename T>
+StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context,
+                                  const Database &db,
+                                  EventPointer event, std::vector<Event>& waitForEvents,
+                                  const size_t src_one, const size_t src_two,
+                                  const size_t src_ld, const size_t src_offset,
+                                  const Buffer<T> &src,
+                                  const size_t dest_one, const size_t dest_two,
+                                  const size_t dest_ld, const size_t dest_offset,
+                                  const Buffer<T> &dest,
+                                  const T alpha,
+                                  const Program &program, const bool do_pad,
+                                  const bool do_transpose, const bool do_conjugate,
+                                  const bool upper = false, const bool lower = false,
+                                  const bool diagonal_imag_zero = false) {
+  // Determines whether or not the fast-version could potentially be used
+  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
+                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
+                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);
+
+  // Determines the right kernel
+  auto kernel_name = std::string{};
+  if (do_transpose) {
+    // The fast kernel runs dest/TRA_WPT threads per dimension in work-groups of TRA_DIM, so both
+    // sizes have to be multiples of TRA_WPT * TRA_DIM (same reasoning as for the copy case below).
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db["TRA_WPT"]) &&
+        IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
+        IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {
+      kernel_name = "TransposeMatrixFast";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
+    }
+  }
+  else {
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db["COPY_VW"]) &&
+        IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) &&
+        IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) {
+      kernel_name = "CopyMatrixFast";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
+    }
+  }
+
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context, 1);
+  alpha_buffer.Write(queue, 1, &alpha);
+
+  // Retrieves the kernel from the compiled binary
+  try {
+    auto kernel = Kernel(program, kernel_name);
+
+    // Sets the kernel arguments: the fast kernels take 4, the others 12 (pad) or 14 (no pad)
+    if (use_fast_kernel) {
+      kernel.SetArgument(0, static_cast<int>(src_ld));
+      kernel.SetArgument(1, src());
+      kernel.SetArgument(2, dest());
+      kernel.SetArgument(3, alpha_buffer());
+    }
+    else {
+      kernel.SetArgument(0, static_cast<int>(src_one));
+      kernel.SetArgument(1, static_cast<int>(src_two));
+      kernel.SetArgument(2, static_cast<int>(src_ld));
+      kernel.SetArgument(3, static_cast<int>(src_offset));
+      kernel.SetArgument(4, src());
+      kernel.SetArgument(5, static_cast<int>(dest_one));
+      kernel.SetArgument(6, static_cast<int>(dest_two));
+      kernel.SetArgument(7, static_cast<int>(dest_ld));
+      kernel.SetArgument(8, static_cast<int>(dest_offset));
+      kernel.SetArgument(9, dest());
+      kernel.SetArgument(10, alpha_buffer());
+      if (do_pad) {
+        kernel.SetArgument(11, static_cast<int>(do_conjugate));
+      }
+      else {
+        kernel.SetArgument(11, static_cast<int>(upper));
+        kernel.SetArgument(12, static_cast<int>(lower));
+        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
+      }
+    }
+
+    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
+    // parameters in the database.
+    if (do_transpose) {
+      if (use_fast_kernel) {
+        const auto global = std::vector<size_t>{dest_one / db["TRA_WPT"],
+                                                dest_two / db["TRA_WPT"]};
+        const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+      else {
+        const auto global = std::vector<size_t>{
+          Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
+          Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
+        };
+        const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+    }
+    else {
+      if (use_fast_kernel) {
+        const auto global = std::vector<size_t>{dest_one / db["COPY_VW"],
+                                                dest_two / db["COPY_WPT"]};
+        const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+      else {
+        const auto global = std::vector<size_t>{
+          Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
+          Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
+        };
+        const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+    }
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_COMMON_H_
+#endif
diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc
index b4add2a3..6b6e7f9e 100644
--- a/src/routines/level1/xamax.cc
+++ b/src/routines/level1/xamax.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xamax.h"
+#include "routines/level1/xamax.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level1/xamax.hpp b/src/routines/level1/xamax.hpp
new file mode 100644
index 00000000..aa45a8e4
--- /dev/null
+++ b/src/routines/level1/xamax.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xamax routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAMAX_H_
+#define CLBLAST_ROUTINES_XAMAX_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xamax routine class, derived from Routine and templated on the element type T of the x buffer
+template <typename T>
+class Xamax: public Routine {
+ public:
+
+  // Constructor; the routine name defaults to "AMAX" but can be overridden by derived classes
+  Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
+
+  // Templated-precision implementation; imax_buffer presumably receives the result index (verify)
+  StatusCode DoAmax(const size_t n,
+                    const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAMAX_H_
+#endif
diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc
index 80f04829..0c1ce903 100644
--- a/src/routines/level1/xasum.cc
+++ b/src/routines/level1/xasum.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xasum.h"
+#include "routines/level1/xasum.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level1/xasum.hpp b/src/routines/level1/xasum.hpp
new file mode 100644
index 00000000..5a253f4d
--- /dev/null
+++ b/src/routines/level1/xasum.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xasum routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XASUM_H_
+#define CLBLAST_ROUTINES_XASUM_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xasum routine class, derived from Routine and templated on the element type T of its buffers
+template <typename T>
+class Xasum: public Routine {
+ public:
+
+  // Constructor; the routine name defaults to "ASUM" but can be overridden by derived classes
+  Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
+
+  // Templated-precision implementation; asum_buffer presumably receives the result (verify)
+  StatusCode DoAsum(const size_t n,
+                    const Buffer<T> &asum_buffer, const size_t asum_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XASUM_H_
+#endif
diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc
index 4a548757..5b6c9e77 100644
--- a/src/routines/level1/xaxpy.cc
+++ b/src/routines/level1/xaxpy.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xaxpy.h"
+#include "routines/level1/xaxpy.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level1/xaxpy.hpp b/src/routines/level1/xaxpy.hpp
new file mode 100644
index 00000000..caac871e
--- /dev/null
+++ b/src/routines/level1/xaxpy.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAXPY_H_
+#define CLBLAST_ROUTINES_XAXPY_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xaxpy routine class, derived from Routine and templated on the element type T of its buffers
+template <typename T>
+class Xaxpy: public Routine {
+ public:
+
+  // Constructor; the routine name defaults to "AXPY"
+  Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
+
+  // Templated-precision implementation; takes the scalar alpha plus an x and a y buffer
+  StatusCode DoAxpy(const size_t n, const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAXPY_H_
+#endif
diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc
index 92d31786..673ef349 100644
--- a/src/routines/level1/xcopy.cc
+++ b/src/routines/level1/xcopy.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xcopy.h"
+#include "routines/level1/xcopy.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level1/xcopy.hpp b/src/routines/level1/xcopy.hpp
new file mode 100644
index 00000000..0c424ba3
--- /dev/null
+++ b/src/routines/level1/xcopy.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xcopy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XCOPY_H_
+#define CLBLAST_ROUTINES_XCOPY_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xcopy routine class, derived from Routine and templated on the element type T of its buffers
+template <typename T>
+class Xcopy: public Routine {
+ public:
+
+  // Constructor; the routine name defaults to "COPY"
+  Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
+
+  // Templated-precision implementation; takes an x and a y buffer, each with offset and increment
+  StatusCode DoCopy(const size_t n,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XCOPY_H_
+#endif
diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc
index 8709c541..bafea157 100644
--- a/src/routines/level1/xdot.cc
+++ b/src/routines/level1/xdot.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xdot.h"
+#include "routines/level1/xdot.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level1/xdot.hpp b/src/routines/level1/xdot.hpp
new file mode 100644
index 00000000..02c1efaa
--- /dev/null
+++ b/src/routines/level1/xdot.hpp
@@ -0,0 +1,42 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xdot routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XDOT_H_
+#define CLBLAST_ROUTINES_XDOT_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xdot routine class, derived from Routine and templated on the element type T of its buffers
+template <typename T>
+class Xdot: public Routine {
+ public:
+
+  // Constructor; the routine name defaults to "DOT" but can be overridden by derived classes
+  Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
+
+  // Templated-precision implementation; the trailing do_conjugate flag is optional (default false)
+  StatusCode DoDot(const size_t n,
+                   const Buffer<T> &dot_buffer, const size_t dot_offset,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                   const bool do_conjugate = false);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XDOT_H_
+#endif
diff --git a/src/routines/level1/xdotc.cc b/src/routines/level1/xdotc.cc
index b3a01079..27cf2bab 100644
--- a/src/routines/level1/xdotc.cc
+++ b/src/routines/level1/xdotc.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xdotc.h"
+#include "routines/level1/xdotc.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level1/xdotc.hpp b/src/routines/level1/xdotc.hpp
new file mode 100644
index 00000000..b8cbdaf5
--- /dev/null
+++ b/src/routines/level1/xdotc.hpp
@@ -0,0 +1,44 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xdotc routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XDOTC_H_
+#define CLBLAST_ROUTINES_XDOTC_H_
+
+#include "routines/level1/xdot.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xdotc routine class, derived from Xdot<T> and templated on the element type T of its buffers
+template <typename T>
+class Xdotc: public Xdot<T> {
+ public:
+
+  // Uses the regular Xdot routine: makes the base-class DoDot visible by name in this template
+  using Xdot<T>::DoDot;
+
+  // Constructor; the routine name defaults to "DOTC"
+  Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
+
+  // Templated-precision implementation; takes DoDot's arguments without the do_conjugate flag
+  StatusCode DoDotc(const size_t n,
+                    const Buffer<T> &dot_buffer, const size_t dot_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XDOTC_H_
+#endif
diff --git a/src/routines/level1/xdotu.cc b/src/routines/level1/xdotu.cc
index 8dded6e0..0bce70b7 100644
--- a/src/routines/level1/xdotu.cc
+++ b/src/routines/level1/xdotu.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xdotu.h"
+#include "routines/level1/xdotu.hpp"
 
 #include <string>
 
diff --git a/src/routines/level1/xdotu.hpp b/src/routines/level1/xdotu.hpp
new file mode 100644
index 00000000..b3f73086
--- /dev/null
+++ b/src/routines/level1/xdotu.hpp
@@ -0,0 +1,44 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xdotu routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XDOTU_H_
+#define CLBLAST_ROUTINES_XDOTU_H_
+
+#include "routines/level1/xdot.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xdotu routine class, derived from Xdot<T> and templated on the element type T of its buffers
+template <typename T>
+class Xdotu: public Xdot<T> {
+ public:
+
+  // Uses the regular Xdot routine: makes the base-class DoDot visible by name in this template
+  using Xdot<T>::DoDot;
+
+  // Constructor; the routine name defaults to "DOTU"
+  Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
+
+  // Templated-precision implementation; takes DoDot's arguments without the do_conjugate flag
+  StatusCode DoDotu(const size_t n,
+                    const Buffer<T> &dot_buffer, const size_t dot_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XDOTU_H_
+#endif
diff --git a/src/routines/level1/xmax.hpp b/src/routines/level1/xmax.hpp
new file mode 100644
index 00000000..5a0236f2
--- /dev/null
+++ b/src/routines/level1/xmax.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xmax routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XMAX_H_
+#define CLBLAST_ROUTINES_XMAX_H_
+
+#include "routine.hpp"
+#include "routines/level1/xamax.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xmax routine class: a header-only wrapper around Xamax<T> that uses the routine name "MAX"
+template <typename T>
+class Xmax: public Xamax<T> {
+ public:
+
+  // Makes the base-class DoAmax callable without qualification inside this template
+  using Xamax<T>::DoAmax;
+
+  // Constructor: forwards all arguments to Xamax<T>; the routine name defaults to "MAX"
+  Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"):
+    Xamax<T>(queue, event, name) {
+  }
+
+  // Forwards all arguments unchanged to DoAmax of the base class. The implementation difference is
+  // realised in the kernel through a pre-processor macro based on the name of the routine.
+  StatusCode DoMax(const size_t n,
+                   const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XMAX_H_
+#endif
diff --git a/src/routines/level1/xmin.hpp b/src/routines/level1/xmin.hpp
new file mode 100644
index 00000000..6befec64
--- /dev/null
+++ b/src/routines/level1/xmin.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xmin routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XMIN_H_
+#define CLBLAST_ROUTINES_XMIN_H_
+
+#include "routine.hpp"
+#include "routines/level1/xamax.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xmin routine class: a header-only wrapper around Xamax<T> that uses the routine name "MIN"
+template <typename T>
+class Xmin: public Xamax<T> {
+ public:
+
+  // Makes the base-class DoAmax callable without qualification inside this template
+  using Xamax<T>::DoAmax;
+
+  // Constructor: forwards all arguments to Xamax<T>; the routine name defaults to "MIN"
+  Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"):
+    Xamax<T>(queue, event, name) {
+  }
+
+  // Forwards all arguments unchanged to DoAmax of the base class. The implementation difference is
+  // realised in the kernel through a pre-processor macro based on the name of the routine.
+  StatusCode DoMin(const size_t n,
+                   const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XMIN_H_
+#endif
diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc
index 105f991c..97615d8b 100644
--- a/src/routines/level1/xnrm2.cc
+++ b/src/routines/level1/xnrm2.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xnrm2.h"
+#include "routines/level1/xnrm2.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level1/xnrm2.hpp b/src/routines/level1/xnrm2.hpp
new file mode 100644
index 00000000..7baf07f5
--- /dev/null
+++ b/src/routines/level1/xnrm2.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xnrm2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XNRM2_H_
+#define CLBLAST_ROUTINES_XNRM2_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xnrm2 routine class, derived from Routine and templated on the element type T of its buffers
+template <typename T>
+class Xnrm2: public Routine {
+ public:
+
+  // Constructor; the routine name defaults to "NRM2"
+  Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
+
+  // Templated-precision implementation; nrm2_buffer presumably receives the result (verify)
+  StatusCode DoNrm2(const size_t n,
+                    const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XNRM2_H_
+#endif
diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc
index 3c1b5257..bcc43c3b 100644
--- a/src/routines/level1/xscal.cc
+++ b/src/routines/level1/xscal.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xscal.h"
+#include "routines/level1/xscal.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level1/xscal.hpp b/src/routines/level1/xscal.hpp
new file mode 100644
index 00000000..6c585cb2
--- /dev/null
+++ b/src/routines/level1/xscal.hpp
@@ -0,0 +1,39 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xscal routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSCAL_H_
+#define CLBLAST_ROUTINES_XSCAL_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xscal routine class, derived from Routine and templated on the element type T of its buffer
+template <typename T>
+class Xscal: public Routine {
+ public:
+
+  // Constructor; the routine name defaults to "SCAL"
+  Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
+
+  // Templated-precision implementation; takes the scalar alpha and a single x buffer
+  StatusCode DoScal(const size_t n, const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSCAL_H_
+#endif
diff --git a/src/routines/level1/xsum.hpp b/src/routines/level1/xsum.hpp
new file mode 100644
index 00000000..84e20bea
--- /dev/null
+++ b/src/routines/level1/xsum.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsum routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSUM_H_
+#define CLBLAST_ROUTINES_XSUM_H_
+
+#include "routine.hpp"
+#include "routines/level1/xasum.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsum routine class (described at the top of this file); derives from Xasum<T>
+template <typename T>
+class Xsum: public Xasum<T> {
+ public:
+
+  // Makes the base-class DoAsum callable by its unqualified name inside this template
+  using Xasum<T>::DoAsum;
+
+  // Constructor: passes queue, event and name on to Xasum<T>; the name defaults to "SUM"
+  Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"):
+    Xasum<T>(queue, event, name) {
+  }
+
+  // Hands all arguments unchanged to DoAsum of the base class. As noted by the original author,
+  // the kernel distinguishes the two through a pre-processor macro based on the routine name.
+  StatusCode DoSum(const size_t n,
+                   const Buffer<T> &sum_buffer, const size_t sum_offset,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSUM_H_
+#endif
diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc
index 27eb9b13..03907cbd 100644
--- a/src/routines/level1/xswap.cc
+++ b/src/routines/level1/xswap.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xswap.h"
+#include "routines/level1/xswap.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level1/xswap.hpp b/src/routines/level1/xswap.hpp
new file mode 100644
index 00000000..4f9ea36d
--- /dev/null
+++ b/src/routines/level1/xswap.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xswap routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSWAP_H_
+#define CLBLAST_ROUTINES_XSWAP_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xswap routine class (described at the top of this file); T selects the precision
+template <typename T>
+class Xswap: public Routine {
+ public:
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "SWAP"
+  Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
+
+  // Templated-precision implementation of the routine (declared here, returns a StatusCode)
+  StatusCode DoSwap(const size_t n,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSWAP_H_
+#endif
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc
index 7a30c34a..ea4f001c 100644
--- a/src/routines/level2/xgbmv.cc
+++ b/src/routines/level2/xgbmv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgbmv.h"
+#include "routines/level2/xgbmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xgbmv.hpp b/src/routines/level2/xgbmv.hpp
new file mode 100644
index 00000000..686ab642
--- /dev/null
+++ b/src/routines/level2/xgbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xgbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGBMV_H_
+#define CLBLAST_ROUTINES_XGBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgbmv routine class (described at the top of this file); derives from Xgemv<T>
+template <typename T>
+class Xgbmv: public Xgemv<T> {
+ public:
+
+  // Makes the generic MatVec of Xgemv<T> callable by its unqualified name inside this template
+  using Xgemv<T>::MatVec;
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "GBMV"
+  Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
+
+  // Templated-precision implementation of the routine (declared here, returns a StatusCode)
+  StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n, const size_t kl, const size_t ku,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGBMV_H_
+#endif
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index ccadd131..21fb397c 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xgemv.hpp b/src/routines/level2/xgemv.hpp
new file mode 100644
index 00000000..e9afec8d
--- /dev/null
+++ b/src/routines/level2/xgemv.hpp
@@ -0,0 +1,56 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemv routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMV_H_
+#define CLBLAST_ROUTINES_XGEMV_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgemv routine class (described at the top of this file); T selects the precision
+template <typename T>
+class Xgemv: public Routine {
+ public:
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "GEMV"
+  Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
+
+  // Templated-precision implementation of the routine (declared here, returns a StatusCode)
+  StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+
+  // Generic version used also by other matrix-vector routines (extra flags plus kl and ku)
+  StatusCode MatVec(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    bool fast_kernel, bool fast_kernel_rot,
+                    const size_t parameter, const bool packed,
+                    const size_t kl, const size_t ku);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMV_H_
+#endif
diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc
index 6ceaa00e..353047d2 100644
--- a/src/routines/level2/xger.cc
+++ b/src/routines/level2/xger.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xger.h"
+#include "routines/level2/xger.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xger.hpp b/src/routines/level2/xger.hpp
new file mode 100644
index 00000000..3c6abe44
--- /dev/null
+++ b/src/routines/level2/xger.hpp
@@ -0,0 +1,43 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xger routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGER_H_
+#define CLBLAST_ROUTINES_XGER_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xger routine class (described at the top of this file); T selects the precision
+template <typename T>
+class Xger: public Routine {
+ public:
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "GER"
+  Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
+
+  // Templated-precision implementation of the routine (declared here, returns a StatusCode)
+  StatusCode DoGer(const Layout layout,
+                   const size_t m, const size_t n,
+                   const T alpha,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGER_H_
+#endif
diff --git a/src/routines/level2/xgerc.cc b/src/routines/level2/xgerc.cc
index 73284b52..d9feda97 100644
--- a/src/routines/level2/xgerc.cc
+++ b/src/routines/level2/xgerc.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgerc.h"
+#include "routines/level2/xgerc.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xgerc.hpp b/src/routines/level2/xgerc.hpp
new file mode 100644
index 00000000..f1d04dfd
--- /dev/null
+++ b/src/routines/level2/xgerc.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgerc routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGERC_H_
+#define CLBLAST_ROUTINES_XGERC_H_
+
+#include "routines/level2/xger.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgerc routine class (described at the top of this file); derives from Xger<T>
+template <typename T>
+class Xgerc: public Xger<T> {
+ public:
+
+  // Makes the regular DoGer of Xger<T> callable by its unqualified name inside this template
+  using Xger<T>::DoGer;
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "GERC"
+  Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
+
+  // Templated-precision implementation of the routine (declared here, returns a StatusCode)
+  StatusCode DoGerc(const Layout layout,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGERC_H_
+#endif
diff --git a/src/routines/level2/xgeru.cc b/src/routines/level2/xgeru.cc
index 7730d6a5..da9e91c2 100644
--- a/src/routines/level2/xgeru.cc
+++ b/src/routines/level2/xgeru.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgeru.h"
+#include "routines/level2/xgeru.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xgeru.hpp b/src/routines/level2/xgeru.hpp
new file mode 100644
index 00000000..fb50e917
--- /dev/null
+++ b/src/routines/level2/xgeru.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgeru routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGERU_H_
+#define CLBLAST_ROUTINES_XGERU_H_
+
+#include "routines/level2/xger.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgeru routine class (described at the top of this file); derives from Xger<T>
+template <typename T>
+class Xgeru: public Xger<T> {
+ public:
+
+  // Makes the regular DoGer of Xger<T> callable by its unqualified name inside this template
+  using Xger<T>::DoGer;
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "GERU"
+  Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
+
+  // Templated-precision implementation of the routine (declared here, returns a StatusCode)
+  StatusCode DoGeru(const Layout layout,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGERU_H_
+#endif
diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc
index 58591b50..f6c0e3c4 100644
--- a/src/routines/level2/xhbmv.cc
+++ b/src/routines/level2/xhbmv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhbmv.h"
+#include "routines/level2/xhbmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xhbmv.hpp b/src/routines/level2/xhbmv.hpp
new file mode 100644
index 00000000..d668eb88
--- /dev/null
+++ b/src/routines/level2/xhbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHBMV_H_
+#define CLBLAST_ROUTINES_XHBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xhbmv routine class (described at the top of this file); derives from Xgemv<T>
+template <typename T>
+class Xhbmv: public Xgemv<T> {
+ public:
+
+  // Makes the generic MatVec of Xgemv<T> callable by its unqualified name inside this template
+  using Xgemv<T>::MatVec;
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "HBMV"
+  Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
+
+  // Templated-precision implementation of the routine (declared here, returns a StatusCode)
+  StatusCode DoHbmv(const Layout layout, const Triangle triangle,
+                    const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHBMV_H_
+#endif
diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc
index b4ef0fa4..2cbcf7b4 100644
--- a/src/routines/level2/xhemv.cc
+++ b/src/routines/level2/xhemv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhemv.h"
+#include "routines/level2/xhemv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xhemv.hpp b/src/routines/level2/xhemv.hpp
new file mode 100644
index 00000000..8e062fd3
--- /dev/null
+++ b/src/routines/level2/xhemv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhemv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHEMV_H_
+#define CLBLAST_ROUTINES_XHEMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xhemv routine class (described at the top of this file); derives from Xgemv<T>
+template <typename T>
+class Xhemv: public Xgemv<T> {
+ public:
+
+  // Makes the generic MatVec of Xgemv<T> callable by its unqualified name inside this template
+  using Xgemv<T>::MatVec;
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "HEMV"
+  Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
+
+  // Templated-precision implementation of the routine (declared here, returns a StatusCode)
+  StatusCode DoHemv(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHEMV_H_
+#endif
diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc
index 939e17bb..ed8ba9e9 100644
--- a/src/routines/level2/xher.cc
+++ b/src/routines/level2/xher.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xher.hpp b/src/routines/level2/xher.hpp
new file mode 100644
index 00000000..9ff6bf3f
--- /dev/null
+++ b/src/routines/level2/xher.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHER_H_
+#define CLBLAST_ROUTINES_XHER_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xher routine class (described at the top of this file); buffers hold T, alpha has type U
+template <typename T, typename U>
+class Xher: public Routine {
+ public:
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "HER"
+  Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
+
+  // Translates alpha of type 'U' into type 'T' (takes the U value, returns the T value)
+  T GetAlpha(const U alpha);
+
+  // Templated-precision implementation of the routine; the 'packed' flag defaults to false
+  StatusCode DoHer(const Layout layout, const Triangle triangle,
+                   const size_t n,
+                   const U alpha,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                   const bool packed = false);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHER_H_
+#endif
diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc
index 95dbd87a..50572cea 100644
--- a/src/routines/level2/xher2.cc
+++ b/src/routines/level2/xher2.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xher2.hpp b/src/routines/level2/xher2.hpp
new file mode 100644
index 00000000..8c53c047
--- /dev/null
+++ b/src/routines/level2/xher2.hpp
@@ -0,0 +1,44 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHER2_H_
+#define CLBLAST_ROUTINES_XHER2_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xher2 routine class (described at the top of this file); T selects the precision
+template <typename T>
+class Xher2: public Routine {
+ public:
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "HER2"
+  Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
+
+  // Templated-precision implementation of the routine; the 'packed' flag defaults to false
+  StatusCode DoHer2(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const bool packed = false);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHER2_H_
+#endif
diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc
index 92686dbe..e6f82b34 100644
--- a/src/routines/level2/xhpmv.cc
+++ b/src/routines/level2/xhpmv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhpmv.h"
+#include "routines/level2/xhpmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xhpmv.hpp b/src/routines/level2/xhpmv.hpp
new file mode 100644
index 00000000..b11192f9
--- /dev/null
+++ b/src/routines/level2/xhpmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xhpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPMV_H_
+#define CLBLAST_ROUTINES_XHPMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xhpmv routine class (described at the top of this file); derives from Xgemv<T>
+template <typename T>
+class Xhpmv: public Xgemv<T> {
+ public:
+
+  // Makes the generic MatVec of Xgemv<T> callable by its unqualified name inside this template
+  using Xgemv<T>::MatVec;
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "HPMV"
+  Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
+
+  // Templated-precision implementation of the routine (declared here, returns a StatusCode)
+  StatusCode DoHpmv(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPMV_H_
+#endif
diff --git a/src/routines/level2/xhpr.cc b/src/routines/level2/xhpr.cc
index 4b31ad09..225ebfe5 100644
--- a/src/routines/level2/xhpr.cc
+++ b/src/routines/level2/xhpr.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhpr.h"
+#include "routines/level2/xhpr.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xhpr.hpp b/src/routines/level2/xhpr.hpp
new file mode 100644
index 00000000..37801c68
--- /dev/null
+++ b/src/routines/level2/xhpr.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpr routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPR_H_
+#define CLBLAST_ROUTINES_XHPR_H_
+
+#include "routines/level2/xher.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xhpr routine class (described at the top of this file); derives from Xher<T,U>
+template <typename T, typename U>
+class Xhpr: public Xher<T,U> {
+ public:
+
+  // Makes the regular DoHer of Xher<T,U> callable by its unqualified name inside this template
+  using Xher<T,U>::DoHer;
+
+  // Constructor: takes the queue and the event pointer; the routine name defaults to "HPR"
+  Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
+
+  // Templated-precision implementation of the routine (declared here, returns a StatusCode)
+  StatusCode DoHpr(const Layout layout, const Triangle triangle,
+                   const size_t n,
+                   const U alpha,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPR_H_
+#endif
diff --git a/src/routines/level2/xhpr2.cc b/src/routines/level2/xhpr2.cc
index 9be24f43..85f9d3f9 100644
--- a/src/routines/level2/xhpr2.cc
+++ b/src/routines/level2/xhpr2.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhpr2.h"
+#include "routines/level2/xhpr2.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xhpr2.hpp b/src/routines/level2/xhpr2.hpp
new file mode 100644
index 00000000..d66dce55
--- /dev/null
+++ b/src/routines/level2/xhpr2.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhpr2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHPR2_H_
+#define CLBLAST_ROUTINES_XHPR2_H_
+
+#include "routines/level2/xher2.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xhpr2 derives from Xher2<T> and adds DoHpr2; T types both the buffers and the alpha scalar
+template <typename T>
+class Xhpr2: public Xher2<T> {
+ public:
+
+  // Brings DoHer2 from the Xher2<T> base class into this class's scope
+  using Xher2<T>::DoHer2;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "HPR2"
+  Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoHpr2(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHPR2_H_
+#endif
diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc
index 66ba74e8..28730899 100644
--- a/src/routines/level2/xsbmv.cc
+++ b/src/routines/level2/xsbmv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsbmv.h"
+#include "routines/level2/xsbmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xsbmv.hpp b/src/routines/level2/xsbmv.hpp
new file mode 100644
index 00000000..16c5e9a8
--- /dev/null
+++ b/src/routines/level2/xsbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xsbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSBMV_H_
+#define CLBLAST_ROUTINES_XSBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsbmv derives from Xgemv<T> and adds DoSbmv; T types the buffers and the alpha/beta scalars
+template <typename T>
+class Xsbmv: public Xgemv<T> {
+ public:
+
+  // Brings MatVec from the Xgemv<T> base class into this class's scope
+  using Xgemv<T>::MatVec;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "SBMV"
+  Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoSbmv(const Layout layout, const Triangle triangle,
+                    const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSBMV_H_
+#endif
diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc
index 589a97d4..f6651012 100644
--- a/src/routines/level2/xspmv.cc
+++ b/src/routines/level2/xspmv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xspmv.h"
+#include "routines/level2/xspmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xspmv.hpp b/src/routines/level2/xspmv.hpp
new file mode 100644
index 00000000..a0c69b85
--- /dev/null
+++ b/src/routines/level2/xspmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xspmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPMV_H_
+#define CLBLAST_ROUTINES_XSPMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xspmv derives from Xgemv<T> and adds DoSpmv; T types the buffers and the alpha/beta scalars
+template <typename T>
+class Xspmv: public Xgemv<T> {
+ public:
+
+  // Brings MatVec from the Xgemv<T> base class into this class's scope
+  using Xgemv<T>::MatVec;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "SPMV"
+  Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoSpmv(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPMV_H_
+#endif
diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cc
index c556b920..a75fe9c3 100644
--- a/src/routines/level2/xspr.cc
+++ b/src/routines/level2/xspr.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xspr.h"
+#include "routines/level2/xspr.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xspr.hpp b/src/routines/level2/xspr.hpp
new file mode 100644
index 00000000..6468c736
--- /dev/null
+++ b/src/routines/level2/xspr.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspr routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPR_H_
+#define CLBLAST_ROUTINES_XSPR_H_
+
+#include "routines/level2/xher.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xspr derives from Xher<T,T> (both base template arguments are T) and adds DoSpr
+template <typename T>
+class Xspr: public Xher<T,T> {
+ public:
+
+  // Brings DoHer from the Xher<T,T> base class into this class's scope
+  using Xher<T,T>::DoHer;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "SPR"
+  Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoSpr(const Layout layout, const Triangle triangle,
+                   const size_t n,
+                   const T alpha,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPR_H_
+#endif
diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cc
index c4ad5dc4..c39a2eb4 100644
--- a/src/routines/level2/xspr2.cc
+++ b/src/routines/level2/xspr2.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xspr2.h"
+#include "routines/level2/xspr2.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xspr2.hpp b/src/routines/level2/xspr2.hpp
new file mode 100644
index 00000000..693c56a1
--- /dev/null
+++ b/src/routines/level2/xspr2.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xspr2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSPR2_H_
+#define CLBLAST_ROUTINES_XSPR2_H_
+
+#include "routines/level2/xher2.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xspr2 derives from Xher2<T> and adds DoSpr2; T types both the buffers and the alpha scalar
+template <typename T>
+class Xspr2: public Xher2<T> {
+ public:
+
+  // Brings DoHer2 from the Xher2<T> base class into this class's scope
+  using Xher2<T>::DoHer2;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "SPR2"
+  Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoSpr2(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSPR2_H_
+#endif
diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc
index 2a404a8a..648d2a3e 100644
--- a/src/routines/level2/xsymv.cc
+++ b/src/routines/level2/xsymv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsymv.h"
+#include "routines/level2/xsymv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xsymv.hpp b/src/routines/level2/xsymv.hpp
new file mode 100644
index 00000000..67815f2f
--- /dev/null
+++ b/src/routines/level2/xsymv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xsymv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYMV_H_
+#define CLBLAST_ROUTINES_XSYMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsymv derives from Xgemv<T> and adds DoSymv; T types the buffers and the alpha/beta scalars
+template <typename T>
+class Xsymv: public Xgemv<T> {
+ public:
+
+  // Brings MatVec from the Xgemv<T> base class into this class's scope
+  using Xgemv<T>::MatVec;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "SYMV"
+  Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoSymv(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYMV_H_
+#endif
diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cc
index 892517d7..758d8f8f 100644
--- a/src/routines/level2/xsyr.cc
+++ b/src/routines/level2/xsyr.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsyr.h"
+#include "routines/level2/xsyr.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xsyr.hpp b/src/routines/level2/xsyr.hpp
new file mode 100644
index 00000000..20393454
--- /dev/null
+++ b/src/routines/level2/xsyr.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR_H_
+#define CLBLAST_ROUTINES_XSYR_H_
+
+#include "routines/level2/xher.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsyr derives from Xher<T,T> (both base template arguments are T) and adds DoSyr
+template <typename T>
+class Xsyr: public Xher<T,T> {
+ public:
+
+  // Brings DoHer from the Xher<T,T> base class into this class's scope
+  using Xher<T,T>::DoHer;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "SYR"
+  Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoSyr(const Layout layout, const Triangle triangle,
+                   const size_t n,
+                   const T alpha,
+                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR_H_
+#endif
diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cc
index e6dfd158..6f43b219 100644
--- a/src/routines/level2/xsyr2.cc
+++ b/src/routines/level2/xsyr2.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsyr2.h"
+#include "routines/level2/xsyr2.hpp"
 
 #include <string>
 
diff --git a/src/routines/level2/xsyr2.hpp b/src/routines/level2/xsyr2.hpp
new file mode 100644
index 00000000..1a8dcbe8
--- /dev/null
+++ b/src/routines/level2/xsyr2.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2 routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR2_H_
+#define CLBLAST_ROUTINES_XSYR2_H_
+
+#include "routines/level2/xher2.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsyr2 derives from Xher2<T> and adds DoSyr2; T types both the buffers and the alpha scalar
+template <typename T>
+class Xsyr2: public Xher2<T> {
+ public:
+
+  // Brings DoHer2 from the Xher2<T> base class into this class's scope
+  using Xher2<T>::DoHer2;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "SYR2"
+  Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoSyr2(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR2_H_
+#endif
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc
index 86e28dfb..e315c544 100644
--- a/src/routines/level2/xtbmv.cc
+++ b/src/routines/level2/xtbmv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xtbmv.h"
+#include "routines/level2/xtbmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xtbmv.hpp b/src/routines/level2/xtbmv.hpp
new file mode 100644
index 00000000..389e9705
--- /dev/null
+++ b/src/routines/level2/xtbmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtbmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtbmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTBMV_H_
+#define CLBLAST_ROUTINES_XTBMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xtbmv derives from Xgemv<T> and adds DoTbmv, which takes no alpha/beta scalar arguments
+template <typename T>
+class Xtbmv: public Xgemv<T> {
+ public:
+
+  // Brings queue_, context_, and MatVec from the Xgemv<T> base class into this class's scope
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
+  using Xgemv<T>::MatVec;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "TBMV"
+  Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoTbmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n, const size_t k,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTBMV_H_
+#endif
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc
index 72445547..46811089 100644
--- a/src/routines/level2/xtpmv.cc
+++ b/src/routines/level2/xtpmv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xtpmv.h"
+#include "routines/level2/xtpmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xtpmv.hpp b/src/routines/level2/xtpmv.hpp
new file mode 100644
index 00000000..0e8cf1d2
--- /dev/null
+++ b/src/routines/level2/xtpmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtpmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtpmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTPMV_H_
+#define CLBLAST_ROUTINES_XTPMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xtpmv derives from Xgemv<T> and adds DoTpmv, which takes no alpha/beta scalar arguments
+template <typename T>
+class Xtpmv: public Xgemv<T> {
+ public:
+
+  // Brings queue_, context_, and MatVec from the Xgemv<T> base class into this class's scope
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
+  using Xgemv<T>::MatVec;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "TPMV"
+  Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoTpmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTPMV_H_
+#endif
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc
index df6f85a3..d2f24252 100644
--- a/src/routines/level2/xtrmv.cc
+++ b/src/routines/level2/xtrmv.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xtrmv.h"
+#include "routines/level2/xtrmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level2/xtrmv.hpp b/src/routines/level2/xtrmv.hpp
new file mode 100644
index 00000000..07dd7841
--- /dev/null
+++ b/src/routines/level2/xtrmv.hpp
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmv routine. It is based on the generalized mat-vec multiplication
+// routine (Xgemv). The Xtrmv class inherits from the templated class Xgemv, allowing it to call the
+// "MatVec" function directly.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMV_H_
+#define CLBLAST_ROUTINES_XTRMV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xtrmv derives from Xgemv<T> and adds DoTrmv, which takes no alpha/beta scalar arguments
+template <typename T>
+class Xtrmv: public Xgemv<T> {
+ public:
+
+  // Brings queue_, context_, and MatVec from the Xgemv<T> base class into this class's scope
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
+  using Xgemv<T>::MatVec;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "TRMV"
+  Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoTrmv(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMV_H_
+#endif
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 8386ad09..9ea5559c 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xgemm.h"
+#include "routines/level3/xgemm.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp
new file mode 100644
index 00000000..71723d78
--- /dev/null
+++ b/src/routines/level3/xgemm.hpp
@@ -0,0 +1,48 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMM_H_
+#define CLBLAST_ROUTINES_XGEMM_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgemm derives directly from Routine and declares DoGemm; T types the buffers and alpha/beta
+template <typename T>
+class Xgemm: public Routine {
+ public:
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "GEMM"
+  Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                    const size_t m, const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                    const T beta,
+                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ protected:
+  // Static constant holding the precision; it is only declared here, not defined in this header
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMM_H_
+#endif
diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc
index 8120c09c..9813503e 100644
--- a/src/routines/level3/xhemm.cc
+++ b/src/routines/level3/xhemm.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xhemm.h"
+#include "routines/level3/xhemm.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp
new file mode 100644
index 00000000..d79b42a1
--- /dev/null
+++ b/src/routines/level3/xhemm.hpp
@@ -0,0 +1,54 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemm routine. It is based on the generalized matrix multiplication
+// routine (Xgemm). The implementation is very similar to the Xsymm routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHEMM_H_
+#define CLBLAST_ROUTINES_XHEMM_H_
+
+#include "routines/level3/xgemm.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Xhemm derives from Xgemm<T> and adds DoHemm; T types the buffers and the alpha/beta scalars
+template <typename T>
+class Xhemm: public Xgemm<T> {
+ public:
+
+  // Uses methods and variables of the regular Xgemm routine
+  using Xgemm<T>::precision_;
+  using Xgemm<T>::routine_name_;
+  using Xgemm<T>::queue_;
+  using Xgemm<T>::context_;
+  using Xgemm<T>::device_;
+  using Xgemm<T>::db_;
+  using Xgemm<T>::DoGemm;
+
+  // Constructor taking the queue and the event; the 'name' argument defaults to "HEMM"
+  Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");
+
+  // Templated-precision implementation of the routine; the outcome is returned as a StatusCode
+  StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                    const T beta,
+                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHEMM_H_
+#endif
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index bd0f83dd..bd7a053e 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xher2k.h"
+#include "routines/level3/xher2k.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level3/xher2k.hpp b/src/routines/level3/xher2k.hpp
new file mode 100644
index 00000000..23996219
--- /dev/null
+++ b/src/routines/level3/xher2k.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2k routine. The precision is implemented using the template argument
+// 'T' (also the type of alpha), whereas the beta argument is of type 'U'. The implementation is
+// very similar to the Xsyr2k routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHER2K_H_
+#define CLBLAST_ROUTINES_XHER2K_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// HER2K routine class derived from Routine; see the comment at the top of this file for details
+template <typename T, typename U>
+class Xher2k: public Routine {
+ public:
+
+  // Constructor taking the queue and event pointer; 'name' defaults to "HER2K"
+  Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
+
+  // Templated-precision implementation of the routine: alpha and buffers use T, beta uses U
+  StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                     const size_t n, const size_t k,
+                     const T alpha,
+                     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                     const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                     const U beta,
+                     const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHER2K_H_
+#endif
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index 6155734a..6ef7f21f 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xherk.h"
+#include "routines/level3/xherk.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level3/xherk.hpp b/src/routines/level3/xherk.hpp
new file mode 100644
index 00000000..3f156a1b
--- /dev/null
+++ b/src/routines/level3/xherk.hpp
@@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xherk routine. The precision is implemented using the template argument
+// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
+// Xsyrk routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHERK_H_
+#define CLBLAST_ROUTINES_XHERK_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// HERK routine class derived from Routine; see the comment at the top of this file for details
+template <typename T, typename U>
+class Xherk: public Routine {
+ public:
+
+  // Constructor taking the queue and event pointer; 'name' defaults to "HERK"
+  Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
+
+  // Templated-precision implementation of the routine: buffers use T, alpha and beta use U
+  StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                    const size_t n, const size_t k,
+                    const U alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const U beta,
+                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHERK_H_
+#endif
diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc
index c5e56617..04e4b718 100644
--- a/src/routines/level3/xsymm.cc
+++ b/src/routines/level3/xsymm.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xsymm.h"
+#include "routines/level3/xsymm.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp
new file mode 100644
index 00000000..754dd7a0
--- /dev/null
+++ b/src/routines/level3/xsymm.hpp
@@ -0,0 +1,56 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm routine. It is based on the generalized matrix multiplication
+// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the
+// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by
+// transforming it into a general matrix, and then calls the regular GEMM code.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYMM_H_
+#define CLBLAST_ROUTINES_XSYMM_H_
+
+#include "routines/level3/xgemm.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// SYMM routine class built on Xgemm<T>; see the comment at the top of this file for details
+template <typename T>
+class Xsymm: public Xgemm<T> {
+ public:
+
+  // Brings these members of the dependent base class Xgemm<T> into scope by unqualified name
+  using Xgemm<T>::precision_;
+  using Xgemm<T>::routine_name_;
+  using Xgemm<T>::queue_;
+  using Xgemm<T>::context_;
+  using Xgemm<T>::device_;
+  using Xgemm<T>::db_;
+  using Xgemm<T>::DoGemm;
+
+  // Constructor taking the queue and event pointer; 'name' defaults to "SYMM"
+  Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");
+
+  // Templated-precision implementation of the routine: scalars and buffers all use type T
+  StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                    const T beta,
+                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYMM_H_
+#endif
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index f9655889..424d4d2d 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xsyr2k.h"
+#include "routines/level3/xsyr2k.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level3/xsyr2k.hpp b/src/routines/level3/xsyr2k.hpp
new file mode 100644
index 00000000..56185653
--- /dev/null
+++ b/src/routines/level3/xsyr2k.hpp
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
+// The implementation is very similar to Xsyrk (see header for details), except for the fact that
+// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR2K_H_
+#define CLBLAST_ROUTINES_XSYR2K_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// SYR2K routine class derived from Routine; see the comment at the top of this file for details
+template <typename T>
+class Xsyr2k: public Routine {
+ public:
+
+  // Constructor taking the queue and event pointer; 'name' defaults to "SYR2K"
+  Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
+
+  // Templated-precision implementation of the routine: scalars and buffers all use type T
+  StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                     const size_t n, const size_t k,
+                     const T alpha,
+                     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                     const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                     const T beta,
+                     const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR2K_H_
+#endif
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index bceb6afd..f56c232b 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xsyrk.h"
+#include "routines/level3/xsyrk.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level3/xsyrk.hpp b/src/routines/level3/xsyrk.hpp
new file mode 100644
index 00000000..7c075c26
--- /dev/null
+++ b/src/routines/level3/xsyrk.hpp
@@ -0,0 +1,47 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk routine. The precision is implemented using a template argument.
+// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
+// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
+// 2) The main Xgemm kernel masks workgroups not contributing to useful data. This is only for
+//    performance reasons, as the actual masking is done later (see the first point).
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYRK_H_
+#define CLBLAST_ROUTINES_XSYRK_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// SYRK routine class derived from Routine; see the comment at the top of this file for details
+template <typename T>
+class Xsyrk: public Routine {
+ public:
+
+  // Constructor taking the queue and event pointer; 'name' defaults to "SYRK"
+  Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
+
+  // Templated-precision implementation of the routine: scalars and buffers all use type T
+  StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                    const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const T beta,
+                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYRK_H_
+#endif
diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc
index 92dda9fb..74a82822 100644
--- a/src/routines/level3/xtrmm.cc
+++ b/src/routines/level3/xtrmm.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xtrmm.h"
+#include "routines/level3/xtrmm.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp
new file mode 100644
index 00000000..bb435592
--- /dev/null
+++ b/src/routines/level3/xtrmm.hpp
@@ -0,0 +1,54 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmm routine. The implementation is based on first transforming the
+// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM
+// routine. Therefore, this class inherits from the Xgemm class.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMM_H_
+#define CLBLAST_ROUTINES_XTRMM_H_
+
+#include "routines/level3/xgemm.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// TRMM routine class built on Xgemm<T>; see the comment at the top of this file for details
+template <typename T>
+class Xtrmm: public Xgemm<T> {
+ public:
+
+  // Brings these members of the dependent base class Xgemm<T> into scope by unqualified name
+  using Xgemm<T>::precision_;
+  using Xgemm<T>::routine_name_;
+  using Xgemm<T>::queue_;
+  using Xgemm<T>::context_;
+  using Xgemm<T>::device_;
+  using Xgemm<T>::db_;
+  using Xgemm<T>::DoGemm;
+
+  // Constructor taking the queue and event pointer; 'name' defaults to "TRMM"
+  Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM");
+
+  // Templated-precision implementation of the routine: alpha and both buffers use type T
+  StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMM_H_
+#endif
diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc
index 6e4bddb2..e8593301 100644
--- a/src/routines/levelx/xomatcopy.cc
+++ b/src/routines/levelx/xomatcopy.cc
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/levelx/xomatcopy.h"
+#include "routines/levelx/xomatcopy.hpp"
 
 #include <string>
 #include <vector>
diff --git a/src/routines/levelx/xomatcopy.hpp b/src/routines/levelx/xomatcopy.hpp
new file mode 100644
index 00000000..0e580230
--- /dev/null
+++ b/src/routines/levelx/xomatcopy.hpp
@@ -0,0 +1,41 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xomatcopy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XOMATCOPY_H_
+#define CLBLAST_ROUTINES_XOMATCOPY_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// OMATCOPY routine class derived from Routine; see the comment at the top of this file
+template <typename T>
+class Xomatcopy: public Routine {
+ public:
+
+  // Constructor taking the queue and event pointer; 'name' defaults to "OMATCOPY"
+  Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY");
+
+  // Templated-precision implementation of the routine: alpha and both buffers use type T
+  StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose,
+                        const size_t m, const size_t n, const T alpha,
+                        const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XOMATCOPY_H_
+#endif