From aa852bbe67a7dc9018afd7d1349184f0284d215c Mon Sep 17 00:00:00 2001
From: CNugteren <web@cedricnugteren.nl>
Date: Sun, 12 Jul 2015 16:57:09 +0200
Subject: Added subfolders for the level1/2/3 routines

---
 CMakeLists.txt                             |  47 ++++++--
 include/internal/routines/level1/xaxpy.h   |  42 +++++++
 include/internal/routines/level2/xgemv.h   |  46 ++++++++
 include/internal/routines/level3/xgemm.h   |  46 ++++++++
 include/internal/routines/level3/xhemm.h   |  58 ++++++++++
 include/internal/routines/level3/xher2k.h  |  48 ++++++++
 include/internal/routines/level3/xherk.h   |  47 ++++++++
 include/internal/routines/level3/xsymm.h   |  60 ++++++++++
 include/internal/routines/level3/xsyr2k.h  |  48 ++++++++
 include/internal/routines/level3/xsyrk.h   |  49 ++++++++
 include/internal/routines/level3/xtrmm.h   |  58 ++++++++++
 include/internal/routines/xaxpy.h          |  42 -------
 include/internal/routines/xgemm.h          |  46 --------
 include/internal/routines/xgemv.h          |  46 --------
 include/internal/routines/xhemm.h          |  58 ----------
 include/internal/routines/xher2k.h         |  48 --------
 include/internal/routines/xherk.h          |  47 --------
 include/internal/routines/xsymm.h          |  60 ----------
 include/internal/routines/xsyr2k.h         |  48 --------
 include/internal/routines/xsyrk.h          |  49 --------
 include/internal/routines/xtrmm.h          |  58 ----------
 src/clblast.cc                             |  20 ++--
 src/routines/level1/xaxpy.cc               | 115 +++++++++++++++++++
 src/routines/level2/xgemv.cc               | 146 +++++++++++++++++++++++
 src/routines/level3/xgemm.cc               | 172 ++++++++++++++++++++++++++++
 src/routines/level3/xhemm.cc               | 130 +++++++++++++++++++++
 src/routines/level3/xher2k.cc              | 178 +++++++++++++++++++++++++++++
 src/routines/level3/xherk.cc               | 156 +++++++++++++++++++++++++
 src/routines/level3/xsymm.cc               | 132 +++++++++++++++++++++
 src/routines/level3/xsyr2k.cc              | 166 +++++++++++++++++++++++++++
 src/routines/level3/xsyrk.cc               | 147 ++++++++++++++++++++++++
 src/routines/level3/xtrmm.cc               | 135 ++++++++++++++++++++++
 src/routines/xaxpy.cc                      | 115 -------------------
 src/routines/xgemm.cc                      | 172 ----------------------------
 src/routines/xgemv.cc                      | 146 -----------------------
 src/routines/xhemm.cc                      | 130 ---------------------
 src/routines/xher2k.cc                     | 178 -----------------------------
 src/routines/xherk.cc                      | 156 -------------------------
 src/routines/xsymm.cc                      | 132 ---------------------
 src/routines/xsyr2k.cc                     | 166 ---------------------------
 src/routines/xsyrk.cc                      | 147 ------------------------
 src/routines/xtrmm.cc                      | 135 ----------------------
 test/correctness/routines/level1/xaxpy.cc  |  81 +++++++++++++
 test/correctness/routines/level2/xgemv.cc  |  99 ++++++++++++++++
 test/correctness/routines/level3/xgemm.cc  | 102 +++++++++++++++++
 test/correctness/routines/level3/xhemm.cc  |  98 ++++++++++++++++
 test/correctness/routines/level3/xher2k.cc | 100 ++++++++++++++++
 test/correctness/routines/level3/xherk.cc  |  92 +++++++++++++++
 test/correctness/routines/level3/xsymm.cc  | 100 ++++++++++++++++
 test/correctness/routines/level3/xsyr2k.cc | 102 +++++++++++++++++
 test/correctness/routines/level3/xsyrk.cc  |  94 +++++++++++++++
 test/correctness/routines/level3/xtrmm.cc  |  96 ++++++++++++++++
 test/correctness/routines/xaxpy.cc         |  81 -------------
 test/correctness/routines/xgemm.cc         | 102 -----------------
 test/correctness/routines/xgemv.cc         |  99 ----------------
 test/correctness/routines/xhemm.cc         |  98 ----------------
 test/correctness/routines/xher2k.cc        | 100 ----------------
 test/correctness/routines/xherk.cc         |  92 ---------------
 test/correctness/routines/xsymm.cc         | 100 ----------------
 test/correctness/routines/xsyr2k.cc        | 102 -----------------
 test/correctness/routines/xsyrk.cc         |  94 ---------------
 test/correctness/routines/xtrmm.cc         |  96 ----------------
 test/performance/routines/level1/xaxpy.cc  |  40 +++++++
 test/performance/routines/level2/xgemv.cc  |  40 +++++++
 test/performance/routines/level3/xgemm.cc  |  40 +++++++
 test/performance/routines/level3/xhemm.cc  |  40 +++++++
 test/performance/routines/level3/xher2k.cc |  40 +++++++
 test/performance/routines/level3/xherk.cc  |  40 +++++++
 test/performance/routines/level3/xsymm.cc  |  40 +++++++
 test/performance/routines/level3/xsyr2k.cc |  40 +++++++
 test/performance/routines/level3/xsyrk.cc  |  40 +++++++
 test/performance/routines/level3/xtrmm.cc  |  40 +++++++
 test/performance/routines/xaxpy.cc         |  40 -------
 test/performance/routines/xgemm.cc         |  40 -------
 test/performance/routines/xgemv.cc         |  40 -------
 test/performance/routines/xhemm.cc         |  40 -------
 test/performance/routines/xher2k.cc        |  40 -------
 test/performance/routines/xherk.cc         |  40 -------
 test/performance/routines/xsymm.cc         |  40 -------
 test/performance/routines/xsyr2k.cc        |  40 -------
 test/performance/routines/xsyrk.cc         |  40 -------
 test/performance/routines/xtrmm.cc         |  40 -------
 test/routines/level1/xaxpy.h               | 113 ++++++++++++++++++
 test/routines/level2/xgemv.h               | 132 +++++++++++++++++++++
 test/routines/level3/xgemm.h               | 134 ++++++++++++++++++++++
 test/routines/level3/xhemm.h               | 134 ++++++++++++++++++++++
 test/routines/level3/xher2k.h              | 132 +++++++++++++++++++++
 test/routines/level3/xherk.h               | 121 ++++++++++++++++++++
 test/routines/level3/xsymm.h               | 134 ++++++++++++++++++++++
 test/routines/level3/xsyr2k.h              | 130 +++++++++++++++++++++
 test/routines/level3/xsyrk.h               | 121 ++++++++++++++++++++
 test/routines/level3/xtrmm.h               | 127 ++++++++++++++++++++
 test/routines/xaxpy.h                      | 113 ------------------
 test/routines/xgemm.h                      | 134 ----------------------
 test/routines/xgemv.h                      | 132 ---------------------
 test/routines/xhemm.h                      | 134 ----------------------
 test/routines/xher2k.h                     | 132 ---------------------
 test/routines/xherk.h                      | 121 --------------------
 test/routines/xsymm.h                      | 134 ----------------------
 test/routines/xsyr2k.h                     | 130 ---------------------
 test/routines/xsyrk.h                      | 121 --------------------
 test/routines/xtrmm.h                      | 127 --------------------
 102 files changed, 4667 insertions(+), 4642 deletions(-)
 create mode 100644 include/internal/routines/level1/xaxpy.h
 create mode 100644 include/internal/routines/level2/xgemv.h
 create mode 100644 include/internal/routines/level3/xgemm.h
 create mode 100644 include/internal/routines/level3/xhemm.h
 create mode 100644 include/internal/routines/level3/xher2k.h
 create mode 100644 include/internal/routines/level3/xherk.h
 create mode 100644 include/internal/routines/level3/xsymm.h
 create mode 100644 include/internal/routines/level3/xsyr2k.h
 create mode 100644 include/internal/routines/level3/xsyrk.h
 create mode 100644 include/internal/routines/level3/xtrmm.h
 delete mode 100644 include/internal/routines/xaxpy.h
 delete mode 100644 include/internal/routines/xgemm.h
 delete mode 100644 include/internal/routines/xgemv.h
 delete mode 100644 include/internal/routines/xhemm.h
 delete mode 100644 include/internal/routines/xher2k.h
 delete mode 100644 include/internal/routines/xherk.h
 delete mode 100644 include/internal/routines/xsymm.h
 delete mode 100644 include/internal/routines/xsyr2k.h
 delete mode 100644 include/internal/routines/xsyrk.h
 delete mode 100644 include/internal/routines/xtrmm.h
 create mode 100644 src/routines/level1/xaxpy.cc
 create mode 100644 src/routines/level2/xgemv.cc
 create mode 100644 src/routines/level3/xgemm.cc
 create mode 100644 src/routines/level3/xhemm.cc
 create mode 100644 src/routines/level3/xher2k.cc
 create mode 100644 src/routines/level3/xherk.cc
 create mode 100644 src/routines/level3/xsymm.cc
 create mode 100644 src/routines/level3/xsyr2k.cc
 create mode 100644 src/routines/level3/xsyrk.cc
 create mode 100644 src/routines/level3/xtrmm.cc
 delete mode 100644 src/routines/xaxpy.cc
 delete mode 100644 src/routines/xgemm.cc
 delete mode 100644 src/routines/xgemv.cc
 delete mode 100644 src/routines/xhemm.cc
 delete mode 100644 src/routines/xher2k.cc
 delete mode 100644 src/routines/xherk.cc
 delete mode 100644 src/routines/xsymm.cc
 delete mode 100644 src/routines/xsyr2k.cc
 delete mode 100644 src/routines/xsyrk.cc
 delete mode 100644 src/routines/xtrmm.cc
 create mode 100644 test/correctness/routines/level1/xaxpy.cc
 create mode 100644 test/correctness/routines/level2/xgemv.cc
 create mode 100644 test/correctness/routines/level3/xgemm.cc
 create mode 100644 test/correctness/routines/level3/xhemm.cc
 create mode 100644 test/correctness/routines/level3/xher2k.cc
 create mode 100644 test/correctness/routines/level3/xherk.cc
 create mode 100644 test/correctness/routines/level3/xsymm.cc
 create mode 100644 test/correctness/routines/level3/xsyr2k.cc
 create mode 100644 test/correctness/routines/level3/xsyrk.cc
 create mode 100644 test/correctness/routines/level3/xtrmm.cc
 delete mode 100644 test/correctness/routines/xaxpy.cc
 delete mode 100644 test/correctness/routines/xgemm.cc
 delete mode 100644 test/correctness/routines/xgemv.cc
 delete mode 100644 test/correctness/routines/xhemm.cc
 delete mode 100644 test/correctness/routines/xher2k.cc
 delete mode 100644 test/correctness/routines/xherk.cc
 delete mode 100644 test/correctness/routines/xsymm.cc
 delete mode 100644 test/correctness/routines/xsyr2k.cc
 delete mode 100644 test/correctness/routines/xsyrk.cc
 delete mode 100644 test/correctness/routines/xtrmm.cc
 create mode 100644 test/performance/routines/level1/xaxpy.cc
 create mode 100644 test/performance/routines/level2/xgemv.cc
 create mode 100644 test/performance/routines/level3/xgemm.cc
 create mode 100644 test/performance/routines/level3/xhemm.cc
 create mode 100644 test/performance/routines/level3/xher2k.cc
 create mode 100644 test/performance/routines/level3/xherk.cc
 create mode 100644 test/performance/routines/level3/xsymm.cc
 create mode 100644 test/performance/routines/level3/xsyr2k.cc
 create mode 100644 test/performance/routines/level3/xsyrk.cc
 create mode 100644 test/performance/routines/level3/xtrmm.cc
 delete mode 100644 test/performance/routines/xaxpy.cc
 delete mode 100644 test/performance/routines/xgemm.cc
 delete mode 100644 test/performance/routines/xgemv.cc
 delete mode 100644 test/performance/routines/xhemm.cc
 delete mode 100644 test/performance/routines/xher2k.cc
 delete mode 100644 test/performance/routines/xherk.cc
 delete mode 100644 test/performance/routines/xsymm.cc
 delete mode 100644 test/performance/routines/xsyr2k.cc
 delete mode 100644 test/performance/routines/xsyrk.cc
 delete mode 100644 test/performance/routines/xtrmm.cc
 create mode 100644 test/routines/level1/xaxpy.h
 create mode 100644 test/routines/level2/xgemv.h
 create mode 100644 test/routines/level3/xgemm.h
 create mode 100644 test/routines/level3/xhemm.h
 create mode 100644 test/routines/level3/xher2k.h
 create mode 100644 test/routines/level3/xherk.h
 create mode 100644 test/routines/level3/xsymm.h
 create mode 100644 test/routines/level3/xsyr2k.h
 create mode 100644 test/routines/level3/xsyrk.h
 create mode 100644 test/routines/level3/xtrmm.h
 delete mode 100644 test/routines/xaxpy.h
 delete mode 100644 test/routines/xgemm.h
 delete mode 100644 test/routines/xgemv.h
 delete mode 100644 test/routines/xhemm.h
 delete mode 100644 test/routines/xher2k.h
 delete mode 100644 test/routines/xherk.h
 delete mode 100644 test/routines/xsymm.h
 delete mode 100644 test/routines/xsyr2k.h
 delete mode 100644 test/routines/xsyrk.h
 delete mode 100644 test/routines/xtrmm.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b2c5657..c97ddd5b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,17 +95,23 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
 # Sets the supported routines and the used kernels. New routines and kernels should be added here.
 set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
 set(SAMPLE_PROGRAMS sgemm)
-set(ROUTINES
-  xaxpy
-  xgemv
-  xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
+set(LEVEL1_ROUTINES xaxpy)
+set(LEVEL2_ROUTINES xgemv)
+set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
+set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
 
 # ==================================================================================================
 
 # Gathers all source-files
 set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc)
-foreach(ROUTINE ${ROUTINES})
-  set(SOURCES ${SOURCES} src/routines/${ROUTINE}.cc)
+foreach(ROUTINE ${LEVEL1_ROUTINES})
+  set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
+endforeach()
+foreach(ROUTINE ${LEVEL2_ROUTINES})
+  set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc)
+endforeach()
+foreach(ROUTINE ${LEVEL3_ROUTINES})
+  set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
 endforeach()
 
 # Creates and links the library
@@ -172,10 +178,19 @@ if(TESTS)
               test/correctness/tester.cc test/correctness/testblas.cc)
 
   # Compiles the correctness-tests
+  foreach(ROUTINE ${LEVEL1_ROUTINES})
+    add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/level1/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${LEVEL2_ROUTINES})
+    add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/level2/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${LEVEL3_ROUTINES})
+    add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/level3/${ROUTINE}.cc)
+  endforeach()
   foreach(ROUTINE ${ROUTINES})
-    add_executable(test_${ROUTINE}
-                   $<TARGET_OBJECTS:test_correctness_common>
-                   test/correctness/routines/${ROUTINE}.cc)
     target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
     install(TARGETS test_${ROUTINE} DESTINATION bin)
   endforeach()
@@ -184,9 +199,19 @@ if(TESTS)
   add_library(test_performance_common OBJECT test/performance/client.cc)
 
   # Compiles the performance-tests
-  foreach(ROUTINE ${ROUTINES})
+  foreach(ROUTINE ${LEVEL1_ROUTINES})
     add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
-                   test/performance/routines/${ROUTINE}.cc)
+                   test/performance/routines/level1/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${LEVEL2_ROUTINES})
+    add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
+                   test/performance/routines/level2/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${LEVEL3_ROUTINES})
+    add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
+                   test/performance/routines/level3/${ROUTINE}.cc)
+  endforeach()
+  foreach(ROUTINE ${ROUTINES})
     target_link_libraries(client_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
     install(TARGETS client_${ROUTINE} DESTINATION bin)
   endforeach()
diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h
new file mode 100644
index 00000000..e548e553
--- /dev/null
+++ b/include/internal/routines/level1/xaxpy.h
@@ -0,0 +1,42 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XAXPY_H_
+#define CLBLAST_ROUTINES_XAXPY_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Xaxpy routine class: derives from Routine and is templated on the type T (see top of file)
+template <typename T>
+class Xaxpy: public Routine {
+ public:
+  Xaxpy(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine ('alpha' is of type T), returns a StatusCode
+  StatusCode DoAxpy(const size_t n, const T alpha,
+                    const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer &y_buffer, const size_t y_offset, const size_t y_inc);
+
+ private:
+  // Static constant used to get the precision; declared here without an initializer
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XAXPY_H_
+#endif
diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h
new file mode 100644
index 00000000..a3109036
--- /dev/null
+++ b/include/internal/routines/level2/xgemv.h
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemv routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMV_H_
+#define CLBLAST_ROUTINES_XGEMV_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgemv routine class: derives from Routine and is templated on the type T (see top of file)
+template <typename T>
+class Xgemv: public Routine {
+ public:
+  Xgemv(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine ('alpha' and 'beta' are both of type T)
+  StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const T beta,
+                    const Buffer &y_buffer, const size_t y_offset, const size_t y_inc);
+
+ private:
+  // Static constant used to get the precision; declared here without an initializer
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMV_H_
+#endif
diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h
new file mode 100644
index 00000000..7ad4fcfb
--- /dev/null
+++ b/include/internal/routines/level3/xgemm.h
@@ -0,0 +1,46 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XGEMM_H_
+#define CLBLAST_ROUTINES_XGEMM_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Xgemm routine class: derives from Routine and is templated on the type T (see top of file)
+template <typename T>
+class Xgemm: public Routine {
+ public:
+  Xgemm(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine ('alpha' and 'beta' are both of type T)
+  StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                    const size_t m, const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                    const T beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static constant used to get the precision; declared here without an initializer
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XGEMM_H_
+#endif
diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h
new file mode 100644
index 00000000..6cc9d9ec
--- /dev/null
+++ b/include/internal/routines/level3/xhemm.h
@@ -0,0 +1,58 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemm routine. It is based on the generalized matrix multiplication
+// routine (Xgemm). The implementation is very similar to the Xsymm routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHEMM_H_
+#define CLBLAST_ROUTINES_XHEMM_H_
+
+#include "internal/routines/level3/xgemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Xhemm routine class: derives from Xgemm<T> so that it can use its DoGemm (see top of file)
+template <typename T>
+class Xhemm: public Xgemm<T> {
+ public:
+
+  // Brings in Routine variables: unqualified lookup does not search the dependent base Xgemm<T>
+  using Routine::db_;
+  using Routine::context_;
+
+  // Brings in several helper functions from the Routine class for the same reason
+  using Routine::RunKernel;
+  using Routine::ErrorIn;
+  using Routine::TestMatrixA;
+  using Routine::GetProgramFromCache;
+
+  // Makes the DoGemm function of the regular Xgemm routine available in this class
+  using Xgemm<T>::DoGemm;
+
+  // Constructor: takes the command queue and the event by reference
+  Xhemm(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine ('alpha' and 'beta' are both of type T)
+  StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                    const T beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHEMM_H_
+#endif
diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h
new file mode 100644
index 00000000..1836a812
--- /dev/null
+++ b/include/internal/routines/level3/xher2k.h
@@ -0,0 +1,48 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2k routine. The precision is implemented using the template argument
+// 'T' (also used for alpha), whereas the beta argument is of type 'U'. The implementation is very
+// similar to the Xsyr2k routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHER2K_H_
+#define CLBLAST_ROUTINES_XHER2K_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Xher2k routine class: derives from Routine, templated on the types T and U (see top of file)
+template <typename T, typename U>
+class Xher2k: public Routine {
+ public:
+  Xher2k(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine ('alpha' is of type T, 'beta' of type U)
+  StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                     const size_t n, const size_t k,
+                     const T alpha,
+                     const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                     const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                     const U beta,
+                     const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static constant used to get the precision; declared here without an initializer
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHER2K_H_
+#endif
diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h
new file mode 100644
index 00000000..9b361254
--- /dev/null
+++ b/include/internal/routines/level3/xherk.h
@@ -0,0 +1,47 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xherk routine. The precision is implemented using the template argument
+// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
+// Xsyrk routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XHERK_H_
+#define CLBLAST_ROUTINES_XHERK_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Xherk routine class: derives from Routine, templated on the types T and U (see top of file)
+template <typename T, typename U>
+class Xherk: public Routine {
+ public:
+  Xherk(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine ('alpha' and 'beta' are both of type U)
+  StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                    const size_t n, const size_t k,
+                    const U alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const U beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static constant used to get the precision; declared here without an initializer
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XHERK_H_
+#endif
diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h
new file mode 100644
index 00000000..2028ceea
--- /dev/null
+++ b/include/internal/routines/level3/xsymm.h
@@ -0,0 +1,60 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm routine. It is based on the generalized matrix multiplication
+// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the
+// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by
+// transforming it into a general matrix, and then calls the regular GEMM code.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYMM_H_
+#define CLBLAST_ROUTINES_XSYMM_H_
+
+#include "internal/routines/level3/xgemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsymm routine class: derives from Xgemm<T> so that it can use its DoGemm (see top of file)
+template <typename T>
+class Xsymm: public Xgemm<T> {
+ public:
+
+  // Brings in Routine variables: unqualified lookup does not search the dependent base Xgemm<T>
+  using Routine::db_;
+  using Routine::context_;
+
+  // Brings in several helper functions from the Routine class for the same reason
+  using Routine::RunKernel;
+  using Routine::ErrorIn;
+  using Routine::TestMatrixA;
+  using Routine::GetProgramFromCache;
+
+  // Makes the DoGemm function of the regular Xgemm routine available in this class
+  using Xgemm<T>::DoGemm;
+
+  // Constructor: takes the command queue and the event by reference
+  Xsymm(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine ('alpha' and 'beta' are both of type T)
+  StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                    const T beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYMM_H_
+#endif
diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h
new file mode 100644
index 00000000..6259313c
--- /dev/null
+++ b/include/internal/routines/level3/xsyr2k.h
@@ -0,0 +1,48 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
+// The implementation is very similar to Xsyrk (see header for details), except for the fact that
+// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYR2K_H_
+#define CLBLAST_ROUTINES_XSYR2K_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsyr2k routine class; per the file comment the XgemmUpper/XgemmLower kernel is called twice
+template <typename T>
+class Xsyr2k: public Routine {
+ public:
+  Xsyr2k(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine (alpha and beta share the type T)
+  StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                     const size_t n, const size_t k,
+                     const T alpha,
+                     const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                     const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                     const T beta,
+                     const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static variable to get the precision (declared here only; defined outside the class)
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYR2K_H_
+#endif
diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h
new file mode 100644
index 00000000..3dab731f
--- /dev/null
+++ b/include/internal/routines/level3/xsyrk.h
@@ -0,0 +1,49 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk routine. The precision is implemented using a template argument.
+// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
+// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
+// 2) The main Xgemm kernel masks workgroups not contributing to useful data. This is only for
+//    performance reasons, as the actual masking is done later (see the first point).
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XSYRK_H_
+#define CLBLAST_ROUTINES_XSYRK_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Xsyrk routine class: Xgemm-based, but only the upper/lower triangular part is updated
+template <typename T>
+class Xsyrk: public Routine {
+ public:
+  Xsyrk(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation of the routine (alpha and beta share the type T)
+  StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                    const size_t n, const size_t k,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const T beta,
+                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
+
+ private:
+  // Static variable to get the precision (declared here only; defined outside the class)
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XSYRK_H_
+#endif
diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h
new file mode 100644
index 00000000..4f49bebd
--- /dev/null
+++ b/include/internal/routines/level3/xtrmm.h
@@ -0,0 +1,58 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmm routine. The implementation is based on first transforming the
+// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM
+// routine. Therefore, this class inherits from the Xgemm class.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRMM_H_
+#define CLBLAST_ROUTINES_XTRMM_H_
+
+#include "internal/routines/level3/xgemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Xtrmm routine class: derives from Xgemm<T>; the triangular matrix is made regular before GEMM
+template <typename T>
+class Xtrmm: public Xgemm<T> {
+ public:
+
+  // Brings Routine member variables into scope (the base class Xgemm<T> is template-dependent)
+  using Routine::db_;
+  using Routine::context_;
+
+  // Brings several helper functions from the Routine class into scope for the same reason
+  using Routine::RunKernel;
+  using Routine::ErrorIn;
+  using Routine::TestMatrixA;
+  using Routine::GetProgramFromCache;
+
+  // Uses the regular Xgemm routine to perform the actual multiplication
+  using Xgemm<T>::DoGemm;
+
+  // Constructor: takes the command queue and the event by (non-const) reference
+  Xtrmm(CommandQueue &queue, Event &event);
+
+  // Templated-precision implementation: makes the triangular matrix regular, then calls DoGemm
+  StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRMM_H_
+#endif
diff --git a/include/internal/routines/xaxpy.h b/include/internal/routines/xaxpy.h
deleted file mode 100644
index e548e553..00000000
--- a/include/internal/routines/xaxpy.h
+++ /dev/null
@@ -1,42 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xaxpy routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XAXPY_H_
-#define CLBLAST_ROUTINES_XAXPY_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xaxpy: public Routine {
- public:
-  Xaxpy(CommandQueue &queue, Event &event);
-
-  // Templated-precision implementation of the routine
-  StatusCode DoAxpy(const size_t n, const T alpha,
-                    const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer &y_buffer, const size_t y_offset, const size_t y_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XAXPY_H_
-#endif
diff --git a/include/internal/routines/xgemm.h b/include/internal/routines/xgemm.h
deleted file mode 100644
index 7ad4fcfb..00000000
--- a/include/internal/routines/xgemm.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemm routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XGEMM_H_
-#define CLBLAST_ROUTINES_XGEMM_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xgemm: public Routine {
- public:
-  Xgemm(CommandQueue &queue, Event &event);
-
-  // Templated-precision implementation of the routine
-  StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
-                    const size_t m, const size_t n, const size_t k,
-                    const T alpha,
-                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                    const T beta,
-                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XGEMM_H_
-#endif
diff --git a/include/internal/routines/xgemv.h b/include/internal/routines/xgemv.h
deleted file mode 100644
index a3109036..00000000
--- a/include/internal/routines/xgemv.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemv routine. The precision is implemented using a template argument.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XGEMV_H_
-#define CLBLAST_ROUTINES_XGEMV_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xgemv: public Routine {
- public:
-  Xgemv(CommandQueue &queue, Event &event);
-
-  // Templated-precision implementation of the routine
-  StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer &y_buffer, const size_t y_offset, const size_t y_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XGEMV_H_
-#endif
diff --git a/include/internal/routines/xhemm.h b/include/internal/routines/xhemm.h
deleted file mode 100644
index 1b1a0dfa..00000000
--- a/include/internal/routines/xhemm.h
+++ /dev/null
@@ -1,58 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xhemm routine. It is based on the generalized matrix multiplication
-// routine (Xgemm). The implementation is very similar to the Xsymm routine.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHEMM_H_
-#define CLBLAST_ROUTINES_XHEMM_H_
-
-#include "internal/routines/xgemm.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xhemm: public Xgemm<T> {
- public:
-
-  // Uses several variables from the Routine class
-  using Routine::db_;
-  using Routine::context_;
-
-  // Uses several helper functions from the Routine class
-  using Routine::RunKernel;
-  using Routine::ErrorIn;
-  using Routine::TestMatrixA;
-  using Routine::GetProgramFromCache;
-
-  // Uses the regular Xgemm routine
-  using Xgemm<T>::DoGemm;
-
-  // Constructor
-  Xhemm(CommandQueue &queue, Event &event);
-
-  // Templated-precision implementation of the routine
-  StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                    const T beta,
-                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHEMM_H_
-#endif
diff --git a/include/internal/routines/xher2k.h b/include/internal/routines/xher2k.h
deleted file mode 100644
index 1836a812..00000000
--- a/include/internal/routines/xher2k.h
+++ /dev/null
@@ -1,48 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xher2k routine. The precision is implemented using the template argument
-// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
-// Xsyr2k routine.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHER2K_H_
-#define CLBLAST_ROUTINES_XHER2K_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T, typename U>
-class Xher2k: public Routine {
- public:
-  Xher2k(CommandQueue &queue, Event &event);
-
-  // Templated-precision implementation of the routine
-  StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
-                     const size_t n, const size_t k,
-                     const T alpha,
-                     const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                     const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                     const U beta,
-                     const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHER2K_H_
-#endif
diff --git a/include/internal/routines/xherk.h b/include/internal/routines/xherk.h
deleted file mode 100644
index 9b361254..00000000
--- a/include/internal/routines/xherk.h
+++ /dev/null
@@ -1,47 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xherk routine. The precision is implemented using the template argument
-// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
-// Xsyrk routine.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XHERK_H_
-#define CLBLAST_ROUTINES_XHERK_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T, typename U>
-class Xherk: public Routine {
- public:
-  Xherk(CommandQueue &queue, Event &event);
-
-  // Templated-precision implementation of the routine
-  StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
-                    const size_t n, const size_t k,
-                    const U alpha,
-                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const U beta,
-                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XHERK_H_
-#endif
diff --git a/include/internal/routines/xsymm.h b/include/internal/routines/xsymm.h
deleted file mode 100644
index c6545164..00000000
--- a/include/internal/routines/xsymm.h
+++ /dev/null
@@ -1,60 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsymm routine. It is based on the generalized matrix multiplication
-// routine (Xgemm). The Xsymm class inherits from the templated class Xgemm, allowing it to call the
-// "DoGemm" function directly. The "DoSymm" function first preprocesses the symmetric matrix by
-// transforming it into a general matrix, and then calls the regular GEMM code.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSYMM_H_
-#define CLBLAST_ROUTINES_XSYMM_H_
-
-#include "internal/routines/xgemm.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xsymm: public Xgemm<T> {
- public:
-
-  // Uses several variables from the Routine class
-  using Routine::db_;
-  using Routine::context_;
-
-  // Uses several helper functions from the Routine class
-  using Routine::RunKernel;
-  using Routine::ErrorIn;
-  using Routine::TestMatrixA;
-  using Routine::GetProgramFromCache;
-
-  // Uses the regular Xgemm routine
-  using Xgemm<T>::DoGemm;
-
-  // Constructor
-  Xsymm(CommandQueue &queue, Event &event);
-
-  // Templated-precision implementation of the routine
-  StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                    const T beta,
-                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSYMM_H_
-#endif
diff --git a/include/internal/routines/xsyr2k.h b/include/internal/routines/xsyr2k.h
deleted file mode 100644
index 6259313c..00000000
--- a/include/internal/routines/xsyr2k.h
+++ /dev/null
@@ -1,48 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
-// The implementation is very similar to Xsyrk (see header for details), except for the fact that
-// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSYR2K_H_
-#define CLBLAST_ROUTINES_XSYR2K_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xsyr2k: public Routine {
- public:
-  Xsyr2k(CommandQueue &queue, Event &event);
-
-  // Templated-precision implementation of the routine
-  StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
-                     const size_t n, const size_t k,
-                     const T alpha,
-                     const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                     const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                     const T beta,
-                     const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSYR2K_H_
-#endif
diff --git a/include/internal/routines/xsyrk.h b/include/internal/routines/xsyrk.h
deleted file mode 100644
index 3dab731f..00000000
--- a/include/internal/routines/xsyrk.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyrk routine. The precision is implemented using a template argument.
-// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
-// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
-// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for
-//    performance reasons, as the actual masking is done later (see the first point).
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XSYRK_H_
-#define CLBLAST_ROUTINES_XSYRK_H_
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xsyrk: public Routine {
- public:
-  Xsyrk(CommandQueue &queue, Event &event);
-
-  // Templated-precision implementation of the routine
-  StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
-                    const size_t n, const size_t k,
-                    const T alpha,
-                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const T beta,
-                    const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XSYRK_H_
-#endif
diff --git a/include/internal/routines/xtrmm.h b/include/internal/routines/xtrmm.h
deleted file mode 100644
index af9f0266..00000000
--- a/include/internal/routines/xtrmm.h
+++ /dev/null
@@ -1,58 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xtrmm routine. The implementation is based on first transforming the
-// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM
-// routine. Therefore, this class inherits from the Xgemm class.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINES_XTRMM_H_
-#define CLBLAST_ROUTINES_XTRMM_H_
-
-#include "internal/routines/xgemm.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Xtrmm: public Xgemm<T> {
- public:
-
-  // Uses several variables from the Routine class
-  using Routine::db_;
-  using Routine::context_;
-
-  // Uses several helper functions from the Routine class
-  using Routine::RunKernel;
-  using Routine::ErrorIn;
-  using Routine::TestMatrixA;
-  using Routine::GetProgramFromCache;
-
-  // Uses the regular Xgemm routine
-  using Xgemm<T>::DoGemm;
-
-  // Constructor
-  Xtrmm(CommandQueue &queue, Event &event);
-
-  // Templated-precision implementation of the routine
-  StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
-                    const Transpose a_transpose, const Diagonal diagonal,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer &b_buffer, const size_t b_offset, const size_t b_ld);
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINES_XTRMM_H_
-#endif
diff --git a/src/clblast.cc b/src/clblast.cc
index 23046b01..b5d53ee6 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -18,20 +18,20 @@
 #include "clblast.h"
 
 // BLAS level-1 includes
-#include "internal/routines/xaxpy.h"
+#include "internal/routines/level1/xaxpy.h"
 
 // BLAS level-2 includes
-#include "internal/routines/xgemv.h"
+#include "internal/routines/level2/xgemv.h"
 
 // BLAS level-3 includes
-#include "internal/routines/xgemm.h"
-#include "internal/routines/xsymm.h"
-#include "internal/routines/xhemm.h"
-#include "internal/routines/xsyrk.h"
-#include "internal/routines/xherk.h"
-#include "internal/routines/xsyr2k.h"
-#include "internal/routines/xher2k.h"
-#include "internal/routines/xtrmm.h"
+#include "internal/routines/level3/xgemm.h"
+#include "internal/routines/level3/xsymm.h"
+#include "internal/routines/level3/xhemm.h"
+#include "internal/routines/level3/xsyrk.h"
+#include "internal/routines/level3/xherk.h"
+#include "internal/routines/level3/xsyr2k.h"
+#include "internal/routines/level3/xher2k.h"
+#include "internal/routines/level3/xtrmm.h"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc
new file mode 100644
index 00000000..fba36851
--- /dev/null
+++ b/src/routines/level1/xaxpy.cc
@@ -0,0 +1,115 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level1/xaxpy.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
+template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
+template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Xaxpy"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
+                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // Makes sure all dimensions are larger than zero
+  if (n == 0) { return StatusCode::kInvalidDimension; }
+
+  // Tests the vectors for validity
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines whether or not the fast-version can be used
+  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
+                         (y_offset == 0) && (y_inc == 1) &&
+                         IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
+
+  // If possible, run the fast-version of the kernel
+  auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
+
+  // Retrieves the Xaxpy kernel from the compiled binary
+  try {
+    auto& program = GetProgramFromCache();
+    auto kernel = Kernel(program, kernel_name);
+
+    // Sets the kernel arguments
+    if (use_fast_kernel) {
+      kernel.SetArgument(0, static_cast<int>(n));
+      kernel.SetArgument(1, alpha);
+      kernel.SetArgument(2, x_buffer());
+      kernel.SetArgument(3, y_buffer());
+    }
+    else {
+      kernel.SetArgument(0, static_cast<int>(n));
+      kernel.SetArgument(1, alpha);
+      kernel.SetArgument(2, x_buffer());
+      kernel.SetArgument(3, static_cast<int>(x_offset));
+      kernel.SetArgument(4, static_cast<int>(x_inc));
+      kernel.SetArgument(5, y_buffer());
+      kernel.SetArgument(6, static_cast<int>(y_offset));
+      kernel.SetArgument(7, static_cast<int>(y_inc));
+    }
+
+    // Launches the kernel
+    if (use_fast_kernel) {
+      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+      auto local = std::vector<size_t>{db_["WGS"]};
+      status = RunKernel(kernel, global, local);
+    }
+    else {
+      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+      auto local = std::vector<size_t>{db_["WGS"]};
+      status = RunKernel(kernel, global, local);
+    }
+    if (ErrorIn(status)) { return status; }
+
+    // Waits for all kernels to finish
+    queue_.Finish();
+
+    // Successfully finished the computation
+    return StatusCode::kSuccess;
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xaxpy<float>;
+template class Xaxpy<double>;
+template class Xaxpy<float2>;
+template class Xaxpy<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
new file mode 100644
index 00000000..181337b6
--- /dev/null
+++ b/src/routines/level2/xgemv.cc
@@ -0,0 +1,146 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xgemv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
+template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
+template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Xgemv"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // Makes sure all dimensions are larger than zero
+  if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrix has an alternative layout (row or column-major).
+  auto a_altlayout = (layout == Layout::kRowMajor);
+  auto a_one = (a_altlayout) ? n : m;
+  auto a_two = (a_altlayout) ? m : n;
+
+  // Swap m and n if the matrix is transposed
+  auto a_transposed = (a_transpose != Transpose::kNo);
+  auto m_real = (a_transposed) ? n : m;
+  auto n_real = (a_transposed) ? m : n;
+
+  // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
+  auto a_rotated = a_transposed ^ a_altlayout;
+
+  // In case of complex data-types, the transpose can also become a conjugate transpose
+  auto a_conjugate = (a_transpose == Transpose::kConjugate);
+
+  // Tests the matrix and the vectors for validity
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines whether or not the fast-version can be used
+  bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
+                         IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
+                         IsMultiple(n, db_["WGS2"]) &&
+                         IsMultiple(a_ld, db_["VW2"]);
+  bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
+                             IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
+                             IsMultiple(n, db_["WGS3"]) &&
+                             IsMultiple(a_ld, db_["VW3"]);
+
+  // If possible, run the fast-version (rotated or non-rotated) of the kernel
+  auto kernel_name = "Xgemv";
+  auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
+  auto global_size = m_ceiled / db_["WPT1"];
+  auto local_size = db_["WGS1"];
+  if (use_fast_kernel) {
+    kernel_name = "XgemvFast";
+    global_size = m_real / db_["WPT2"];
+    local_size = db_["WGS2"];
+  }
+  if (use_fast_kernel_rot) {
+    kernel_name = "XgemvFastRot";
+    global_size = m_real / db_["WPT3"];
+    local_size = db_["WGS3"];
+  }
+
+  // Retrieves the Xgemv kernel from the compiled binary
+  try {
+    auto& program = GetProgramFromCache();
+    auto kernel = Kernel(program, kernel_name);
+
+    // Sets the kernel arguments
+    kernel.SetArgument(0, static_cast<int>(m_real));
+    kernel.SetArgument(1, static_cast<int>(n_real));
+    kernel.SetArgument(2, alpha);
+    kernel.SetArgument(3, beta);
+    kernel.SetArgument(4, static_cast<int>(a_rotated));
+    kernel.SetArgument(5, a_buffer());
+    kernel.SetArgument(6, static_cast<int>(a_offset));
+    kernel.SetArgument(7, static_cast<int>(a_ld));
+    kernel.SetArgument(8, x_buffer());
+    kernel.SetArgument(9, static_cast<int>(x_offset));
+    kernel.SetArgument(10, static_cast<int>(x_inc));
+    kernel.SetArgument(11, y_buffer());
+    kernel.SetArgument(12, static_cast<int>(y_offset));
+    kernel.SetArgument(13, static_cast<int>(y_inc));
+    kernel.SetArgument(14, static_cast<int>(a_conjugate));
+
+    // Launches the kernel
+    auto global = std::vector<size_t>{global_size};
+    auto local = std::vector<size_t>{local_size};
+    status = RunKernel(kernel, global, local);
+    if (ErrorIn(status)) { return status; }
+
+    // Waits for all kernels to finish
+    queue_.Finish();
+
+    // Successfully finished the computation
+    return StatusCode::kSuccess;
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgemv<float>;
+template class Xgemv<double>;
+template class Xgemv<float2>;
+template class Xgemv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
new file mode 100644
index 00000000..f4a9f737
--- /dev/null
+++ b/src/routines/level3/xgemm.cc
@@ -0,0 +1,172 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xgemm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
+template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
+template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xgemm<T>::DoGemm(const Layout layout,
+                            const Transpose a_transpose, const Transpose b_transpose,
+                            const size_t m, const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed. Note
+  // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of
+  // col-major) to be transformed, so transposing requirements are not the same as whether or not
+  // the matrix is actually transposed in memory.
+  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
+                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
+  auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
+                   (layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+  auto a_do_transpose =  a_rotated;
+  auto b_do_transpose = !b_rotated;
+  auto c_do_transpose =  c_rotated;
+
+  // In case of complex data-types, the transpose can also become a conjugate transpose
+  auto a_conjugate = (a_transpose == Transpose::kConjugate);
+  auto b_conjugate = (b_transpose == Transpose::kConjugate);
+
+  // Computes the first and second dimensions of the 3 matrices taking into account whether the
+  // matrices are rotated or not
+  auto a_one = (a_rotated) ? k : m;
+  auto a_two = (a_rotated) ? m : k;
+  auto b_one = (b_rotated) ? n : k;
+  auto b_two = (b_rotated) ? k : n;
+  auto c_one = (c_rotated) ? n : m;
+  auto c_two = (c_rotated) ? m : n;
+
+  // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than K when rotated, or less than M when not-rotated
+  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N when rotated, or less than M when not-rotated
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of m, n, and k
+  auto m_ceiled = Ceil(m, db_["MWG"]);
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
+    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
+    // them up until they reach a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                    m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
+                                    a_do_transpose, a_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
+                                    b_do_transpose, b_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Only necessary for matrix C if it is used both as input and output
+    if (beta != static_cast<T>(0)) {
+      status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
+                                      m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
+                                      c_do_transpose, false, true, false, false, false, program);
+      if (ErrorIn(status)) { return status; }
+    }
+
+    // Retrieves the Xgemm kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, "Xgemm");
+
+      // Sets the kernel arguments
+      kernel.SetArgument(0, static_cast<int>(m_ceiled));
+      kernel.SetArgument(1, static_cast<int>(n_ceiled));
+      kernel.SetArgument(2, static_cast<int>(k_ceiled));
+      kernel.SetArgument(3, alpha);
+      kernel.SetArgument(4, beta);
+      kernel.SetArgument(5, temp_a());
+      kernel.SetArgument(6, temp_b());
+      kernel.SetArgument(7, temp_c());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (m_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
+                                      c_one, c_two, c_ld, c_offset, c_buffer,
+                                      c_do_transpose, false, false, false, false, false, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgemm<float>;
+template class Xgemm<double>;
+template class Xgemm<float2>;
+template class Xgemm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc
new file mode 100644
index 00000000..bc257c44
--- /dev/null
+++ b/src/routines/level3/xhemm.cc
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xhemm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhemm<T>::Xhemm(CommandQueue &queue, Event &event):
+    Xgemm<T>(queue, event) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
+  // left) or B (on the right) in the Xgemm routine.
+  auto k = (side == Side::kLeft) ? m : n;
+
+  // Checks for validity of the squared A matrix
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
+  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
+
+  // Temporary buffer for a copy of the hermitian matrix
+  try {
+    auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+
+    // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
+    // routine afterwards
+    try {
+      auto& program = GetProgramFromCache();
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the arguments for the hermitian-to-squared kernel
+      kernel.SetArgument(0, static_cast<int>(k));
+      kernel.SetArgument(1, static_cast<int>(a_ld));
+      kernel.SetArgument(2, static_cast<int>(a_offset));
+      kernel.SetArgument(3, a_buffer());
+      kernel.SetArgument(4, static_cast<int>(k));
+      kernel.SetArgument(5, static_cast<int>(k));
+      kernel.SetArgument(6, static_cast<int>(0));
+      kernel.SetArgument(7, temp_herm());
+
+      // Uses the common padding kernel's thread configuration. This is allowed, since the
+      // hermitian-to-squared kernel uses the same parameters.
+      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the regular Xgemm code with either "C := AB+C" or ...
+      if (side == Side::kLeft) {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        temp_herm, 0, k,
+                        b_buffer, b_offset, b_ld,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+      }
+
+      // ... with "C := BA+C". Note that A and B are now reversed.
+      else {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        b_buffer, b_offset, b_ld,
+                        temp_herm, 0, k,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+
+        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+        switch(status) {
+          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
+          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
+          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
+          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
+          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
+          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
+        }
+      }
+
+      // Return the status of the Xgemm routine
+      return status;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhemm<float2>;
+template class Xhemm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
new file mode 100644
index 00000000..6d33a0e1
--- /dev/null
+++ b/src/routines/level3/xher2k.cc
@@ -0,0 +1,178 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2k class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xher2k.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xher2k<float2,float>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xher2k<double2,double>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T, typename U>
+Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine: two passes of the triangular Xgemm kernel on padded copies of A, B and C
+template <typename T, typename U>
+StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                                const size_t n, const size_t k,
+                                const T alpha,
+                                const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                                const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                                const U beta,
+                                const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
+  // to matrix A (argument: conjugate transpose)
+  auto ab_conjugate = (ab_transpose != Transpose::kNo);
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) ||
+                    (layout == Layout::kRowMajor && !ab_conjugate);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A and B matrices taking the layout into account
+  auto ab_one = (ab_rotated) ? k : n;
+  auto ab_two = (ab_rotated) ? n : k;
+
+  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_b1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_a2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_b2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
+    // fill them up until they reach a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a1,
+                                    ab_rotated, ab_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a2,
+                                    ab_rotated, !ab_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b1,
+                                    ab_rotated, ab_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b2,
+                                    ab_rotated, !ab_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Also copies (and possibly pads) matrix C, since its other triangle may not be modified.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                    c_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      auto complex_beta = T{beta, static_cast<U>(0.0)};
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, complex_beta);
+      kernel.SetArgument(4, temp_a1());
+      kernel.SetArgument(5, temp_b2());
+      kernel.SetArgument(6, temp_c());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
+      auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
+      auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
+      kernel.SetArgument(2, conjugate_alpha);
+      kernel.SetArgument(3, complex_one);
+      kernel.SetArgument(4, temp_b1());
+      kernel.SetArgument(5, temp_a2());
+
+      // Runs the kernel again
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      c_rotated, false, false, upper, lower, true, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xher2k<float2,float>;
+template class Xher2k<double2,double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
new file mode 100644
index 00000000..8fae294f
--- /dev/null
+++ b/src/routines/level3/xherk.cc
@@ -0,0 +1,156 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xherk class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xherk.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Defines the static precision_ member for each <T,U> pair that this file instantiates
+template <> const Precision Xherk<float2,float>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xherk<double2,double>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: only forwards queue, event, the string list below and precision_ to Routine
+template <typename T, typename U>
+Xherk<T,U>::Xherk(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine: checks A and C, builds padded copies, runs one kernel, writes C back
+template <typename T, typename U>
+StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                              const size_t n, const size_t k,
+                              const U alpha,
+                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                              const U beta,
+                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Rejects empty problems: a zero n or k is reported as kInvalidDimension
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
+  // to matrix A (argument: conjugate transpose). Exactly one of the two flags is set.
+  auto a_conjugate = (a_transpose != Transpose::kNo);
+  auto b_conjugate = (a_transpose == Transpose::kNo);
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto a_rotated = (layout == Layout::kColMajor && a_conjugate) ||
+                   (layout == Layout::kRowMajor && !a_conjugate);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A matrix taking the layout into account
+  auto a_one = (a_rotated) ? k : n;
+  auto a_two = (a_rotated) ? n : k;
+
+  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k (rounded with the "NWG" and "KWG" database entries)
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
+    // fill it up until it reaches a certain multiple of size (kernel parameter dependent). It
+    // creates two copies: temp_a (made with a_conjugate) and temp_b (made with b_conjugate).
+    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
+                                    a_rotated, a_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
+                                    a_rotated, b_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                    c_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments; the real alpha and beta become T values with a zero second part
+      auto complex_alpha = T{alpha, static_cast<U>(0.0)};
+      auto complex_beta = T{beta, static_cast<U>(0.0)};
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, complex_alpha);
+      kernel.SetArgument(3, complex_beta);
+      kernel.SetArgument(4, temp_a());
+      kernel.SetArgument(5, temp_b());
+      kernel.SetArgument(6, temp_c());
+
+      // Computes the global and local thread sizes; both global dimensions derive from n_ceiled
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel: temp_c goes back into C with the upper/lower flags set
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      c_rotated, false, false, upper, lower, true, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }  // any throw in the inner try block
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }  // any other throw in the outer try
+}
+
+// =================================================================================================
+
+// Explicit instantiations: emits Xherk for the two <T,U> type pairs listed here
+template class Xherk<float2,float>;
+template class Xherk<double2,double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc
new file mode 100644
index 00000000..1d17f0eb
--- /dev/null
+++ b/src/routines/level3/xsymm.cc
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xsymm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards queue and event unchanged to the Xgemm<T> base; the body is empty
+template <typename T>
+Xsymm<T>::Xsymm(CommandQueue &queue, Event &event):
+    Xgemm<T>(queue, event) {
+}
+
+// =================================================================================================
+
+// The main routine: expands the symmetric A into a full k-by-k buffer, then defers to DoGemm
+template <typename T>
+StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Rejects empty problems: a zero m or n is reported as kInvalidDimension
+  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
+  // left) or B (on the right) in the Xgemm routine.
+  auto k = (side == Side::kLeft) ? m : n;
+
+  // Checks for validity of the squared A matrix
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
+  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
+
+  // Temporary buffer for a copy of the symmetric matrix
+  try {
+    auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+
+    // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
+    // routine afterwards
+    try {
+      auto& program = GetProgramFromCache();
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the arguments for the symmetric-to-squared kernel: source A first, then temp_symm
+      kernel.SetArgument(0, static_cast<int>(k));
+      kernel.SetArgument(1, static_cast<int>(a_ld));
+      kernel.SetArgument(2, static_cast<int>(a_offset));
+      kernel.SetArgument(3, a_buffer());
+      kernel.SetArgument(4, static_cast<int>(k));
+      kernel.SetArgument(5, static_cast<int>(k));
+      kernel.SetArgument(6, static_cast<int>(0));
+      kernel.SetArgument(7, temp_symm());
+
+      // Uses the common padding kernel's thread configuration. This is allowed, since the
+      // symmetric-to-squared kernel uses the same parameters.
+      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the regular Xgemm code with either "C := AB+C" or ...
+      if (side == Side::kLeft) {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        temp_symm, 0, k,
+                        b_buffer, b_offset, b_ld,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+      }
+
+      // ... with "C := BA+C". Note that A and B are now reversed.
+      else {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        b_buffer, b_offset, b_ld,
+                        temp_symm, 0, k,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+        switch(status) {
+          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
+          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
+          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
+          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
+          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
+          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
+          default: break;  // every other status code is returned unchanged
+        }
+      }
+
+      // Return the status of the Xgemm routine
+      return status;
+    } catch (...) { return StatusCode::kInvalidKernel; }  // any throw in the inner try block
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }  // any other throw in the outer try
+}
+
+// =================================================================================================
+
+// Explicit instantiations: emits Xsymm for the four element types listed here
+template class Xsymm<float>;
+template class Xsymm<double>;
+template class Xsymm<float2>;
+template class Xsymm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
new file mode 100644
index 00000000..d54f2fc1
--- /dev/null
+++ b/src/routines/level3/xsyr2k.cc
@@ -0,0 +1,166 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xsyr2k.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Defines the static precision_ member for each element type that this file instantiates
+template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
+template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
+template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: only forwards queue, event, the string list below and precision_ to Routine
+template <typename T>
+Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine: checks A, B and C, builds padded copies, runs the kernel twice, writes C back
+template <typename T>
+StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                              const size_t n, const size_t k,
+                              const T alpha,
+                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                              const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                              const T beta,
+                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Rejects empty problems: a zero n or k is reported as kInvalidDimension
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
+                    (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A and B matrices taking the layout into account
+  auto ab_one = (ab_rotated) ? k : n;
+  auto ab_two = (ab_rotated) ? n : k;
+
+  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k (rounded with the "NWG" and "KWG" database entries)
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
+    // fill them up until they reach a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
+                                    ab_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
+                                    ab_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                    c_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments for the first launch (inputs in the order temp_a, temp_b)
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, beta);
+      kernel.SetArgument(4, temp_a());
+      kernel.SetArgument(5, temp_b());
+      kernel.SetArgument(6, temp_c());
+
+      // Computes the global and local thread sizes; both global dimensions derive from n_ceiled
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Swaps the arguments for matrices A and B, and sets 'beta' to 1 (only args 3, 4, 5 change)
+      auto one = static_cast<T>(1);
+      kernel.SetArgument(3, one);
+      kernel.SetArgument(4, temp_b());
+      kernel.SetArgument(5, temp_a());
+
+      // Runs the kernel again, with the same thread configuration and the same temp_c output
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel: temp_c goes back into C with the upper/lower flags set
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      c_rotated, false, false, upper, lower, false, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }  // any throw in the inner try block
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }  // any other throw in the outer try
+}
+
+// =================================================================================================
+
+// Explicit instantiations: emits Xsyr2k for the four element types listed here
+template class Xsyr2k<float>;
+template class Xsyr2k<double>;
+template class Xsyr2k<float2>;
+template class Xsyr2k<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
new file mode 100644
index 00000000..bb952410
--- /dev/null
+++ b/src/routines/level3/xsyrk.cc
@@ -0,0 +1,147 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xsyrk.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Defines the static precision_ member for each element type that this file instantiates
+template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
+template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
+template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: only forwards queue, event, the string list below and precision_ to Routine
+template <typename T>
+Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine: checks A and C, builds padded copies, runs one kernel, writes C back
+template <typename T>
+StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                            const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Rejects empty problems: a zero n or k is reported as kInvalidDimension
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
+                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A matrix taking the layout into account
+  auto a_one = (a_rotated) ? k : n;
+  auto a_two = (a_rotated) ? n : k;
+
+  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k (rounded with the "NWG" and "KWG" database entries)
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
+    // fill it up until it reaches a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
+                                    a_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                    c_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments; temp_a is deliberately bound to both input slots (4 and 5)
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, beta);
+      kernel.SetArgument(4, temp_a());
+      kernel.SetArgument(5, temp_a());  // same buffer as argument 4: there is no separate B here
+      kernel.SetArgument(6, temp_c());
+
+      // Computes the global and local thread sizes; both global dimensions derive from n_ceiled
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel: temp_c goes back into C with the upper/lower flags set
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      c_rotated, false, false, upper, lower, false, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }  // any throw in the inner try block
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }  // any other throw in the outer try
+}
+
+// =================================================================================================
+
+// Explicit instantiations: emits Xsyrk for the four element types listed here
+template class Xsyrk<float>;
+template class Xsyrk<double>;
+template class Xsyrk<float2>;
+template class Xsyrk<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc
new file mode 100644
index 00000000..52f272e3
--- /dev/null
+++ b/src/routines/level3/xtrmm.cc
@@ -0,0 +1,135 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xtrmm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to the Xgemm base class, whose DoGemm routine is re-used by DoTrmm
+template <typename T>
+Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event):
+    Xgemm<T>(queue, event) {
+}
+
+// =================================================================================================
+
+// The main routine: expands triangular A into a full k-by-k matrix and then calls Xgemm on it
+template <typename T>
+StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+
+  // Computes the k dimension. This is based on whether the triangular matrix is A (on the left)
+  // or B (on the right) in the Xgemm routine.
+  auto k = (side == Side::kLeft) ? m : n;
+
+  // Checks for validity of the triangular A matrix
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
+  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";
+
+  // Determines whether or not the triangular matrix is unit-diagonal
+  auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
+
+  // Temporary buffer for a full (k by k) copy of the triangular matrix
+  try {
+    auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+
+    // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
+    // routine afterwards
+    try {
+      auto& program = GetProgramFromCache();
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the arguments for the triangular-to-squared kernel
+      kernel.SetArgument(0, static_cast<int>(k));
+      kernel.SetArgument(1, static_cast<int>(a_ld));
+      kernel.SetArgument(2, static_cast<int>(a_offset));
+      kernel.SetArgument(3, a_buffer());
+      kernel.SetArgument(4, static_cast<int>(k));
+      kernel.SetArgument(5, static_cast<int>(k));
+      kernel.SetArgument(6, static_cast<int>(0));
+      kernel.SetArgument(7, temp_triangular());
+      kernel.SetArgument(8, static_cast<int>(unit_diagonal));
+
+      // Uses the common padding kernel's thread configuration. This is allowed, since the
+      // triangular-to-squared kernel uses the same parameters.
+      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the regular Xgemm code with beta = 0 and B as output: either "B := alpha*A*B" or ...
+      if (side == Side::kLeft) {
+        status = DoGemm(layout, a_transpose, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        temp_triangular, 0, k,
+                        b_buffer, b_offset, b_ld,
+                        static_cast<T>(0.0),
+                        b_buffer, b_offset, b_ld);
+      }
+
+      // ... with "B := alpha*B*A". Note that A and B are now reversed.
+      else {
+        status = DoGemm(layout, Transpose::kNo, a_transpose,
+                        m, n, k,
+                        alpha,
+                        b_buffer, b_offset, b_ld,
+                        temp_triangular, 0, k,
+                        static_cast<T>(0.0),
+                        b_buffer, b_offset, b_ld);
+
+        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+        switch(status) {
+          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
+          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
+          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
+          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
+          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
+          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
+        }
+      }
+
+      // Returns the (possibly A/B-swapped) status of the Xgemm routine
+      return status;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class for the real (float, double) and complex (float2, double2) types
+template class Xtrmm<float>;
+template class Xtrmm<double>;
+template class Xtrmm<float2>;
+template class Xtrmm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/xaxpy.cc b/src/routines/xaxpy.cc
deleted file mode 100644
index b68458da..00000000
--- a/src/routines/xaxpy.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xaxpy class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xaxpy.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
-template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
-template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Xaxpy"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
-                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
-
-  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
-
-  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines whether or not the fast-version can be used
-  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
-                         (y_offset == 0) && (y_inc == 1) &&
-                         IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
-
-  // If possible, run the fast-version of the kernel
-  auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
-
-  // Retrieves the Xaxpy kernel from the compiled binary
-  try {
-    auto& program = GetProgramFromCache();
-    auto kernel = Kernel(program, kernel_name);
-
-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha);
-      kernel.SetArgument(2, x_buffer());
-      kernel.SetArgument(3, y_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha);
-      kernel.SetArgument(2, x_buffer());
-      kernel.SetArgument(3, static_cast<int>(x_offset));
-      kernel.SetArgument(4, static_cast<int>(x_inc));
-      kernel.SetArgument(5, y_buffer());
-      kernel.SetArgument(6, static_cast<int>(y_offset));
-      kernel.SetArgument(7, static_cast<int>(y_inc));
-    }
-
-    // Launches the kernel
-    if (use_fast_kernel) {
-      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local);
-    }
-    else {
-      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
-      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local);
-    }
-    if (ErrorIn(status)) { return status; }
-
-    // Waits for all kernels to finish
-    queue_.Finish();
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xaxpy<float>;
-template class Xaxpy<double>;
-template class Xaxpy<float2>;
-template class Xaxpy<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc
deleted file mode 100644
index c8674282..00000000
--- a/src/routines/xgemm.cc
+++ /dev/null
@@ -1,172 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemm class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xgemm.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
-template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
-template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xgemm<T>::DoGemm(const Layout layout,
-                            const Transpose a_transpose, const Transpose b_transpose,
-                            const size_t m, const size_t n, const size_t k,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                            const T beta,
-                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
-
-  // Computes whether or not the matrices are transposed in memory. This is based on their layout
-  // (row or column-major) and whether or not they are requested to be pre-transposed. Note
-  // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of
-  // col-major) to be transformed, so transposing requirements are not the same as whether or not
-  // the matrix is actually transposed in memory.
-  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
-                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
-  auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
-                   (layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
-  auto c_rotated = (layout == Layout::kRowMajor);
-  auto a_do_transpose =  a_rotated;
-  auto b_do_transpose = !b_rotated;
-  auto c_do_transpose =  c_rotated;
-
-  // In case of complex data-types, the transpose can also become a conjugate transpose
-  auto a_conjugate = (a_transpose == Transpose::kConjugate);
-  auto b_conjugate = (b_transpose == Transpose::kConjugate);
-
-  // Computes the first and second dimensions of the 3 matrices taking into account whether the
-  // matrices are rotated or not
-  auto a_one = (a_rotated) ? k : m;
-  auto a_two = (a_rotated) ? m : k;
-  auto b_one = (b_rotated) ? n : k;
-  auto b_two = (b_rotated) ? k : n;
-  auto c_one = (c_rotated) ? n : m;
-  auto c_two = (c_rotated) ? m : n;
-
-  // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
-  // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the
-  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
-  // space. Also tests that the leading dimensions of:
-  //    matrix A cannot be less than K when rotated, or less than M when not-rotated
-  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix C cannot be less than N when rotated, or less than M when not-rotated
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Calculates the ceiled versions of m, n, and k
-  auto m_ceiled = Ceil(m, db_["MWG"]);
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
-
-  // Allocates space on the device for padded and/or transposed input and output matrices.
-  try {
-    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
-    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
-
-    // Loads the program from the database
-    auto& program = GetProgramFromCache();
-
-    // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
-    // them up until they reach a certain multiple of size (kernel parameter dependent).
-    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
-                                    m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
-                                    a_do_transpose, a_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
-                                    b_do_transpose, b_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Only necessary for matrix C if it used both as input and output
-    if (beta != static_cast<T>(0)) {
-      status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
-                                      m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
-                                      c_do_transpose, false, true, false, false, false, program);
-      if (ErrorIn(status)) { return status; }
-    }
-
-    // Retrieves the Xgemm kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, "Xgemm");
-
-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(m_ceiled));
-      kernel.SetArgument(1, static_cast<int>(n_ceiled));
-      kernel.SetArgument(2, static_cast<int>(k_ceiled));
-      kernel.SetArgument(3, alpha);
-      kernel.SetArgument(4, beta);
-      kernel.SetArgument(5, temp_a());
-      kernel.SetArgument(6, temp_b());
-      kernel.SetArgument(7, temp_c());
-
-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (m_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
-      // Launches the kernel
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel
-      status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
-                                      c_one, c_two, c_ld, c_offset, c_buffer,
-                                      c_do_transpose, false, false, false, false, false, program);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xgemm<float>;
-template class Xgemm<double>;
-template class Xgemm<float2>;
-template class Xgemm<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xgemv.cc b/src/routines/xgemv.cc
deleted file mode 100644
index 1868dec4..00000000
--- a/src/routines/xgemv.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemv class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xgemv.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
-template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
-template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Xgemv"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
-
-  // Makes sure all dimensions are larger than zero
-  if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
-
-  // Computes whether or not the matrix has an alternative layout (row or column-major).
-  auto a_altlayout = (layout == Layout::kRowMajor);
-  auto a_one = (a_altlayout) ? n : m;
-  auto a_two = (a_altlayout) ? m : n;
-
-  // Swap m and n if the matrix is transposed
-  auto a_transposed = (a_transpose != Transpose::kNo);
-  auto m_real = (a_transposed) ? n : m;
-  auto n_real = (a_transposed) ? m : n;
-
-  // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
-  auto a_rotated = a_transposed ^ a_altlayout;
-
-  // In case of complex data-types, the transpose can also become a conjugate transpose
-  auto a_conjugate = (a_transpose == Transpose::kConjugate);
-
-  // Tests the matrix and the vectors for validity
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines whether or not the fast-version can be used
-  bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
-                         IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
-                         IsMultiple(n, db_["WGS2"]) &&
-                         IsMultiple(a_ld, db_["VW2"]);
-  bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
-                             IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
-                             IsMultiple(n, db_["WGS3"]) &&
-                             IsMultiple(a_ld, db_["VW3"]);
-
-  // If possible, run the fast-version (rotated or non-rotated) of the kernel
-  auto kernel_name = "Xgemv";
-  auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
-  auto global_size = m_ceiled / db_["WPT1"];
-  auto local_size = db_["WGS1"];
-  if (use_fast_kernel) {
-    kernel_name = "XgemvFast";
-    global_size = m_real / db_["WPT2"];
-    local_size = db_["WGS2"];
-  }
-  if (use_fast_kernel_rot) {
-    kernel_name = "XgemvFastRot";
-    global_size = m_real / db_["WPT3"];
-    local_size = db_["WGS3"];
-  }
-
-  // Retrieves the Xgemv kernel from the compiled binary
-  try {
-    auto& program = GetProgramFromCache();
-    auto kernel = Kernel(program, kernel_name);
-
-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(m_real));
-    kernel.SetArgument(1, static_cast<int>(n_real));
-    kernel.SetArgument(2, alpha);
-    kernel.SetArgument(3, beta);
-    kernel.SetArgument(4, static_cast<int>(a_rotated));
-    kernel.SetArgument(5, a_buffer());
-    kernel.SetArgument(6, static_cast<int>(a_offset));
-    kernel.SetArgument(7, static_cast<int>(a_ld));
-    kernel.SetArgument(8, x_buffer());
-    kernel.SetArgument(9, static_cast<int>(x_offset));
-    kernel.SetArgument(10, static_cast<int>(x_inc));
-    kernel.SetArgument(11, y_buffer());
-    kernel.SetArgument(12, static_cast<int>(y_offset));
-    kernel.SetArgument(13, static_cast<int>(y_inc));
-    kernel.SetArgument(14, static_cast<int>(a_conjugate));
-
-    // Launches the kernel
-    auto global = std::vector<size_t>{global_size};
-    auto local = std::vector<size_t>{local_size};
-    status = RunKernel(kernel, global, local);
-    if (ErrorIn(status)) { return status; }
-
-    // Waits for all kernels to finish
-    queue_.Finish();
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xgemv<float>;
-template class Xgemv<double>;
-template class Xgemv<float2>;
-template class Xgemv<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xhemm.cc b/src/routines/xhemm.cc
deleted file mode 100644
index 73f769ed..00000000
--- a/src/routines/xhemm.cc
+++ /dev/null
@@ -1,130 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xhemm class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xhemm.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xhemm<T>::Xhemm(CommandQueue &queue, Event &event):
-    Xgemm<T>(queue, event) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                            const T beta,
-                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
-  // left) or B (on the right) in the Xgemm routine.
-  auto k = (side == Side::kLeft) ? m : n;
-
-  // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
-  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
-
-  // Temporary buffer for a copy of the hermitian matrix
-  try {
-    auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
-
-    // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
-    // routine afterwards
-    try {
-      auto& program = GetProgramFromCache();
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the hermitian-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(k));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(k));
-      kernel.SetArgument(5, static_cast<int>(k));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_herm());
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // hermitian-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the regular Xgemm code with either "C := AB+C" or ...
-      if (side == Side::kLeft) {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        temp_herm, 0, k,
-                        b_buffer, b_offset, b_ld,
-                        beta,
-                        c_buffer, c_offset, c_ld);
-      }
-
-      // ... with "C := BA+C". Note that A and B are now reversed.
-      else {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        b_buffer, b_offset, b_ld,
-                        temp_herm, 0, k,
-                        beta,
-                        c_buffer, c_offset, c_ld);
-
-        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
-        switch(status) {
-          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
-          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
-          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
-          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
-          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
-          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
-        }
-      }
-
-      // Return the status of the Xgemm routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xhemm<float2>;
-template class Xhemm<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xher2k.cc b/src/routines/xher2k.cc
deleted file mode 100644
index b19b743b..00000000
--- a/src/routines/xher2k.cc
+++ /dev/null
@@ -1,178 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xher2k class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xher2k.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xher2k<float2,float>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xher2k<double2,double>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T, typename U>
-Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T, typename U>
-StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
-                                const size_t n, const size_t k,
-                                const T alpha,
-                                const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                                const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                                const U beta,
-                                const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
-  // to matrix A (argument: conjugate transpose)
-  auto ab_conjugate = (ab_transpose != Transpose::kNo);
-
-  // Computes whether or not the matrices are transposed in memory. This is based on their layout
-  // (row or column-major) and whether or not they are requested to be pre-transposed.
-  auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) ||
-                    (layout == Layout::kRowMajor && !ab_conjugate);
-  auto c_rotated = (layout == Layout::kRowMajor);
-
-  // Computes the first and second dimensions of the A and B matrices taking the layout into account
-  auto ab_one = (ab_rotated) ? k : n;
-  auto ab_two = (ab_rotated) ? n : k;
-
-  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
-  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
-  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
-  // space. Also tests that the leading dimensions of:
-  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix C cannot be less than N
-  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Calculates the ceiled versions of n and k
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
-
-  // Decides which kernel to run: the upper-triangular or lower-triangular version
-  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
-
-  // Allocates space on the device for padded and/or transposed input and output matrices.
-  try {
-    auto temp_a1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_b1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_a2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_b2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
-
-    // Loads the program from the database
-    auto& program = GetProgramFromCache();
-
-    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
-    // fill them up until they reach a certain multiple of size (kernel parameter dependent).
-    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a1,
-                                    ab_rotated, ab_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a2,
-                                    ab_rotated, !ab_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b1,
-                                    ab_rotated, ab_conjugate, true, false, false, false, program);
-    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b2,
-                                    ab_rotated, !ab_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                    c_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the kernel arguments
-      auto complex_beta = T{beta, static_cast<U>(0.0)};
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha);
-      kernel.SetArgument(3, complex_beta);
-      kernel.SetArgument(4, temp_a1());
-      kernel.SetArgument(5, temp_b2());
-      kernel.SetArgument(6, temp_c());
-
-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
-      // Launches the kernel
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
-      auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
-      auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
-      kernel.SetArgument(2, conjugate_alpha);
-      kernel.SetArgument(3, complex_one);
-      kernel.SetArgument(4, temp_b1());
-      kernel.SetArgument(5, temp_a2());
-
-      // Runs the kernel again
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      c_rotated, false, false, upper, lower, true, program);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xher2k<float2,float>;
-template class Xher2k<double2,double>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xherk.cc b/src/routines/xherk.cc
deleted file mode 100644
index 6bc9cd6c..00000000
--- a/src/routines/xherk.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xherk class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xherk.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xherk<float2,float>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xherk<double2,double>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T, typename U>
-Xherk<T,U>::Xherk(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T, typename U>
-StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
-                              const size_t n, const size_t k,
-                              const U alpha,
-                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                              const U beta,
-                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
-  // to matrix A (argument: conjugate transpose)
-  auto a_conjugate = (a_transpose != Transpose::kNo);
-  auto b_conjugate = (a_transpose == Transpose::kNo);
-
-  // Computes whether or not the matrices are transposed in memory. This is based on their layout
-  // (row or column-major) and whether or not they are requested to be pre-transposed.
-  auto a_rotated = (layout == Layout::kColMajor && a_conjugate) ||
-                   (layout == Layout::kRowMajor && !a_conjugate);
-  auto c_rotated = (layout == Layout::kRowMajor);
-
-  // Computes the first and second dimensions of the A matrix taking the layout into account
-  auto a_one = (a_rotated) ? k : n;
-  auto a_two = (a_rotated) ? n : k;
-
-  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
-  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
-  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
-  // space. Also tests that the leading dimensions of:
-  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix C cannot be less than N
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Calculates the ceiled versions of n and k
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
-
-  // Decides which kernel to run: the upper-triangular or lower-triangular version
-  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
-
-  // Allocates space on the device for padded and/or transposed input and output matrices.
-  try {
-    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
-
-    // Loads the program from the database
-    auto& program = GetProgramFromCache();
-
-    // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
-    // fill it up until it reaches a certain multiple of size (kernel parameter dependent). It
-    // creates two copies: 
-    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
-                                    a_rotated, a_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
-                                    a_rotated, b_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                    c_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the kernel arguments
-      auto complex_alpha = T{alpha, static_cast<U>(0.0)};
-      auto complex_beta = T{beta, static_cast<U>(0.0)};
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, complex_alpha);
-      kernel.SetArgument(3, complex_beta);
-      kernel.SetArgument(4, temp_a());
-      kernel.SetArgument(5, temp_b());
-      kernel.SetArgument(6, temp_c());
-
-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
-      // Launches the kernel
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      c_rotated, false, false, upper, lower, true, program);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xherk<float2,float>;
-template class Xherk<double2,double>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xsymm.cc b/src/routines/xsymm.cc
deleted file mode 100644
index b39eb24d..00000000
--- a/src/routines/xsymm.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsymm class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xsymm.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xsymm<T>::Xsymm(CommandQueue &queue, Event &event):
-    Xgemm<T>(queue, event) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                            const T beta,
-                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
-  // left) or B (on the right) in the Xgemm routine.
-  auto k = (side == Side::kLeft) ? m : n;
-
-  // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
-  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
-
-  // Temporary buffer for a copy of the symmetric matrix
-  try {
-    auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
-
-    // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
-    // routine afterwards
-    try {
-      auto& program = GetProgramFromCache();
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the symmetric-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(k));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(k));
-      kernel.SetArgument(5, static_cast<int>(k));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_symm());
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // symmetric-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the regular Xgemm code with either "C := AB+C" or ...
-      if (side == Side::kLeft) {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        temp_symm, 0, k,
-                        b_buffer, b_offset, b_ld,
-                        beta,
-                        c_buffer, c_offset, c_ld);
-      }
-
-      // ... with "C := BA+C". Note that A and B are now reversed.
-      else {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        b_buffer, b_offset, b_ld,
-                        temp_symm, 0, k,
-                        beta,
-                        c_buffer, c_offset, c_ld);
-
-        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
-        switch(status) {
-          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
-          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
-          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
-          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
-          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
-          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
-        }
-      }
-
-      // Return the status of the Xgemm routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xsymm<float>;
-template class Xsymm<double>;
-template class Xsymm<float2>;
-template class Xsymm<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xsyr2k.cc b/src/routines/xsyr2k.cc
deleted file mode 100644
index abb8b7eb..00000000
--- a/src/routines/xsyr2k.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyr2k class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xsyr2k.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
-template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
-template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
-                              const size_t n, const size_t k,
-                              const T alpha,
-                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                              const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                              const T beta,
-                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Computes whether or not the matrices are transposed in memory. This is based on their layout
-  // (row or column-major) and whether or not they are requested to be pre-transposed.
-  auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
-                    (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
-  auto c_rotated = (layout == Layout::kRowMajor);
-
-  // Computes the first and second dimensions of the A and B matrices taking the layout into account
-  auto ab_one = (ab_rotated) ? k : n;
-  auto ab_two = (ab_rotated) ? n : k;
-
-  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
-  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
-  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
-  // space. Also tests that the leading dimensions of:
-  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix C cannot be less than N
-  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Calculates the ceiled versions of n and k
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
-
-  // Decides which kernel to run: the upper-triangular or lower-triangular version
-  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
-
-  // Allocates space on the device for padded and/or transposed input and output matrices.
-  try {
-    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
-
-    // Loads the program from the database
-    auto& program = GetProgramFromCache();
-
-    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
-    // fill them up until they reach a certain multiple of size (kernel parameter dependent).
-    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
-                                    ab_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
-                                    ab_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                    c_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha);
-      kernel.SetArgument(3, beta);
-      kernel.SetArgument(4, temp_a());
-      kernel.SetArgument(5, temp_b());
-      kernel.SetArgument(6, temp_c());
-
-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
-      // Launches the kernel
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Swaps the arguments for matrices A and B, and sets 'beta' to 1
-      auto one = static_cast<T>(1);
-      kernel.SetArgument(3, one);
-      kernel.SetArgument(4, temp_b());
-      kernel.SetArgument(5, temp_a());
-
-      // Runs the kernel again
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      c_rotated, false, false, upper, lower, false, program);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xsyr2k<float>;
-template class Xsyr2k<double>;
-template class Xsyr2k<float2>;
-template class Xsyr2k<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xsyrk.cc b/src/routines/xsyrk.cc
deleted file mode 100644
index 3efa0598..00000000
--- a/src/routines/xsyrk.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyrk class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xsyrk.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
-template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
-template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
-                            const size_t n, const size_t k,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const T beta,
-                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Computes whether or not the matrices are transposed in memory. This is based on their layout
-  // (row or column-major) and whether or not they are requested to be pre-transposed.
-  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
-                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
-  auto c_rotated = (layout == Layout::kRowMajor);
-
-  // Computes the first and second dimensions of the A matrix taking the layout into account
-  auto a_one = (a_rotated) ? k : n;
-  auto a_two = (a_rotated) ? n : k;
-
-  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
-  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
-  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
-  // space. Also tests that the leading dimensions of:
-  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix C cannot be less than N
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Calculates the ceiled versions of n and k
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
-
-  // Decides which kernel to run: the upper-triangular or lower-triangular version
-  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
-
-  // Allocates space on the device for padded and/or transposed input and output matrices.
-  try {
-    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
-
-    // Loads the program from the database
-    auto& program = GetProgramFromCache();
-
-    // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
-    // fill it up until it reaches a certain multiple of size (kernel parameter dependent).
-    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
-                                    a_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                    c_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha);
-      kernel.SetArgument(3, beta);
-      kernel.SetArgument(4, temp_a());
-      kernel.SetArgument(5, temp_a());
-      kernel.SetArgument(6, temp_c());
-
-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
-      // Launches the kernel
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      c_rotated, false, false, upper, lower, false, program);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xsyrk<float>;
-template class Xsyrk<double>;
-template class Xsyrk<float2>;
-template class Xsyrk<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xtrmm.cc b/src/routines/xtrmm.cc
deleted file mode 100644
index 543df844..00000000
--- a/src/routines/xtrmm.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xtrmm class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xtrmm.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event):
-    Xgemm<T>(queue, event) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
-                            const Transpose a_transpose, const Diagonal diagonal,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
-
-  // Computes the k dimension. This is based on whether or not matrix is A (on the left)
-  // or B (on the right) in the Xgemm routine.
-  auto k = (side == Side::kLeft) ? m : n;
-
-  // Checks for validity of the triangular A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
-  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";
-
-  // Determines whether or not the triangular matrix is unit-diagonal
-  auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
-
-  // Temporary buffer for a copy of the triangular matrix
-  try {
-    auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
-
-    // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
-    // routine afterwards
-    try {
-      auto& program = GetProgramFromCache();
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the triangular-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(k));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(k));
-      kernel.SetArgument(5, static_cast<int>(k));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_triangular());
-      kernel.SetArgument(8, static_cast<int>(unit_diagonal));
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // triangular-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the regular Xgemm code with either "B := alpha*A*B" or ...
-      if (side == Side::kLeft) {
-        status = DoGemm(layout, a_transpose, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        temp_triangular, 0, k,
-                        b_buffer, b_offset, b_ld,
-                        static_cast<T>(0.0),
-                        b_buffer, b_offset, b_ld);
-      }
-
-      // ... with "B := alpha*B*A". Note that A and B are now reversed.
-      else {
-        status = DoGemm(layout, Transpose::kNo, a_transpose,
-                        m, n, k,
-                        alpha,
-                        b_buffer, b_offset, b_ld,
-                        temp_triangular, 0, k,
-                        static_cast<T>(0.0),
-                        b_buffer, b_offset, b_ld);
-
-        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
-        switch(status) {
-          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
-          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
-          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
-          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
-          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
-          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
-        }
-      }
-
-      // Return the status of the Xgemm routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xtrmm<float>;
-template class Xtrmm<double>;
-template class Xtrmm<float2>;
-template class Xtrmm<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/test/correctness/routines/level1/xaxpy.cc b/test/correctness/routines/level1/xaxpy.cc
new file mode 100644
index 00000000..ac44caec
--- /dev/null
+++ b/test/correctness/routines/level1/xaxpy.cc
@@ -0,0 +1,81 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xaxpy routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level1/xaxpy.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Builds the regular and invalid-buffer argument lists for Xaxpy and runs both test passes
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester wired to the TestXaxpy<T> static callbacks
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXaxpy<T>::GetOptions(),
+                       TestXaxpy<T>::RunRoutine, TestXaxpy<T>::RunReference,
+                       TestXaxpy<T>::DownloadResult, TestXaxpy<T>::GetResultIndex,
+                       TestXaxpy<T>::ResultID1, TestXaxpy<T>::ResultID2};
+
+  // Scratch argument set: filled in by the loops below and copied into the test vectors
+  auto args = Arguments<T>{};
+
+  // Regular tests: sweeps n, x/y increments, x/y offsets and alpha; skips zero-sized buffers
+  auto regular_test_vector = std::vector<Arguments<T>>{};
+  for (auto &n: tester.kVectorDims) { args.n = n;
+    for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
+      for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
+        for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
+          for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
+            for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+              args.x_size = TestXaxpy<T>::GetSizeX(args);
+              args.y_size = TestXaxpy<T>::GetSizeY(args);
+              if (args.x_size<1 || args.y_size<1) { continue; }
+              regular_test_vector.push_back(args);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Invalid-buffer tests: fixed n, increments and offsets, sweeping only the x/y buffer sizes
+  auto invalid_test_vector = std::vector<Arguments<T>>{};
+  args.n = tester.kBufferSize;
+  args.x_inc = args.y_inc = 1;
+  args.x_offset = args.y_offset = 0;
+  for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
+    for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
+      invalid_test_vector.push_back(args);
+    }
+  }
+
+  // Runs both test passes under a single case name
+  const auto case_name = "default";
+  tester.TestRegular(regular_test_vector, case_name);
+  tester.TestInvalid(invalid_test_vector, case_name);
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Entry point (outside the clblast namespace): runs S/D/C/Z tests, only the first non-silent
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "SAXPY");
+  clblast::RunTest<double>(argc, argv, true, "DAXPY");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CAXPY");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZAXPY");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xgemv.cc b/test/correctness/routines/level2/xgemv.cc
new file mode 100644
index 00000000..4e6942cc
--- /dev/null
+++ b/test/correctness/routines/level2/xgemv.cc
@@ -0,0 +1,99 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xgemv routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level2/xgemv.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Runs the Xgemv regular and invalid-buffer tests for every layout/transpose combination
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester wired to the TestXgemv<T> static callbacks
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXgemv<T>::GetOptions(),
+                       TestXgemv<T>::RunRoutine, TestXgemv<T>::RunReference,
+                       TestXgemv<T>::DownloadResult, TestXgemv<T>::GetResultIndex,
+                       TestXgemv<T>::ResultID1, TestXgemv<T>::ResultID2};
+
+  // Scratch argument set: filled in by the loops below and copied into the test vectors
+  auto args = Arguments<T>{};
+
+  // One test case per (layout, a_transpose) pair; each pair gets fresh test vectors
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
+
+      // Regular tests: sweeps m, n, a_ld, offsets, increments, alpha and beta
+      auto regular_test_vector = std::vector<Arguments<T>>{};
+      for (auto &m: tester.kMatrixVectorDims) { args.m = m;
+        for (auto &n: tester.kMatrixVectorDims) { args.n = n;
+          for (auto &a_ld: tester.kMatrixVectorDims) { args.a_ld = a_ld;
+            for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+              for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
+                for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
+                  for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
+                    for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
+                      for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                        for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                          args.a_size = TestXgemv<T>::GetSizeA(args);
+                          args.x_size = TestXgemv<T>::GetSizeX(args);
+                          args.y_size = TestXgemv<T>::GetSizeY(args);
+                          if (args.a_size<1 || args.x_size<1 || args.y_size<1) { continue; }
+                          regular_test_vector.push_back(args);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+
+      // Invalid-buffer tests: fixed dimensions and offsets, sweeping only the buffer sizes
+      auto invalid_test_vector = std::vector<Arguments<T>>{};
+      args.m = args.n = tester.kBufferSize;
+      args.a_ld = tester.kBufferSize;
+      args.x_inc = args.y_inc = 1;
+      args.a_offset = args.x_offset = args.y_offset = 0;
+      for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+        for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
+          for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
+            invalid_test_vector.push_back(args);
+          }
+        }
+      }
+
+      // Runs both test passes, labelled with the current layout and transpose
+      const auto case_name = ToString(layout)+" "+ToString(a_transpose);
+      tester.TestRegular(regular_test_vector, case_name);
+      tester.TestInvalid(invalid_test_vector, case_name);
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Entry point (outside the clblast namespace): runs S/D/C/Z tests, only the first non-silent
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "SGEMV");
+  clblast::RunTest<double>(argc, argv, true, "DGEMV");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMV");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMV");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xgemm.cc b/test/correctness/routines/level3/xgemm.cc
new file mode 100644
index 00000000..c1ce8fe2
--- /dev/null
+++ b/test/correctness/routines/level3/xgemm.cc
@@ -0,0 +1,102 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xgemm routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xgemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Runs the Xgemm regular and invalid-buffer tests for every layout/transpose combination
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester wired to the TestXgemm<T> static callbacks
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXgemm<T>::GetOptions(),
+                       TestXgemm<T>::RunRoutine, TestXgemm<T>::RunReference,
+                       TestXgemm<T>::DownloadResult, TestXgemm<T>::GetResultIndex,
+                       TestXgemm<T>::ResultID1, TestXgemm<T>::ResultID2};
+
+  // Scratch argument set: filled in by the loops below and copied into the test vectors
+  auto args = Arguments<T>{};
+
+  // One test case per (layout, a_transpose, b_transpose) triple, each with fresh test vectors
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
+      for (auto &b_transpose: tester.kTransposes) { args.b_transpose = b_transpose;
+
+        // Regular tests: sweeps m, n, k, leading dimensions, offsets, alpha and beta
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &m: tester.kMatrixDims) { args.m = m;
+          for (auto &n: tester.kMatrixDims) { args.n = n;
+            for (auto &k: tester.kMatrixDims) { args.k = k;
+              for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+                for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                  for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                    for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                      for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                        for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                          for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                            for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                              args.a_size = TestXgemm<T>::GetSizeA(args);
+                              args.b_size = TestXgemm<T>::GetSizeB(args);
+                              args.c_size = TestXgemm<T>::GetSizeC(args);
+                              if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                              regular_test_vector.push_back(args);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Invalid-buffer tests: fixed dimensions and offsets, sweeping only the buffer sizes
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.m = args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }
+
+        // Runs both test passes, labelled with the current layout and transposes
+        const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Entry point (outside the clblast namespace): runs S/D/C/Z tests, only the first non-silent
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "SGEMM");
+  clblast::RunTest<double>(argc, argv, true, "DGEMM");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMM");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMM");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xhemm.cc b/test/correctness/routines/level3/xhemm.cc
new file mode 100644
index 00000000..4d66a57f
--- /dev/null
+++ b/test/correctness/routines/level3/xhemm.cc
@@ -0,0 +1,98 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xhemm routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xhemm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Runs the Xhemm regular and invalid-buffer tests for every layout/side/triangle combination
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester wired to the TestXhemm<T> static callbacks
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXhemm<T>::GetOptions(),
+                       TestXhemm<T>::RunRoutine, TestXhemm<T>::RunReference,
+                       TestXhemm<T>::DownloadResult, TestXhemm<T>::GetResultIndex,
+                       TestXhemm<T>::ResultID1, TestXhemm<T>::ResultID2};
+
+  // Scratch argument set: filled in by the loops below and copied into the test vectors
+  auto args = Arguments<T>{};
+
+  // One test case per (layout, side, triangle) triple, each with fresh test vectors
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &side: tester.kSides) { args.side = side;
+      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+
+        // Regular tests: sweeps m, n, leading dimensions, offsets, alpha and beta
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &m: tester.kMatrixDims) { args.m = m;
+          for (auto &n: tester.kMatrixDims) { args.n = n;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                            args.a_size = TestXhemm<T>::GetSizeA(args);
+                            args.b_size = TestXhemm<T>::GetSizeB(args);
+                            args.c_size = TestXhemm<T>::GetSizeC(args);
+                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                            regular_test_vector.push_back(args);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Invalid-buffer tests: fixed dimensions and offsets, sweeping only the buffer sizes
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.m = args.n = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }
+
+        // Runs both test passes, labelled with the current layout, side and triangle
+        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (outside the clblast namespace): first precision non-silent, the rest silent
+int main(int argc, char *argv[]) {
+  clblast::RunTest<clblast::float2>(argc, argv, false, "CHEMM");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZHEMM");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xher2k.cc b/test/correctness/routines/level3/xher2k.cc
new file mode 100644
index 00000000..ba5260fb
--- /dev/null
+++ b/test/correctness/routines/level3/xher2k.cc
@@ -0,0 +1,100 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xher2k routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xher2k.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester: assembles the Xher2k test-cases and runs them through TestBlas
+template <typename T, typename U>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester, handing it the static TestXher2k<T,U> members for this routine
+  TestBlas<T,U> tester{argc, argv, silent, name, TestXher2k<T,U>::GetOptions(),
+                       TestXher2k<T,U>::RunRoutine, TestXher2k<T,U>::RunReference,
+                       TestXher2k<T,U>::DownloadResult, TestXher2k<T,U>::GetResultIndex,
+                       TestXher2k<T,U>::ResultID1, TestXher2k<T,U>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine; it is reused across all cases
+  auto args = Arguments<U>{};
+
+  // Loops over the test-cases from a data-layout point of view
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+      for (auto &ab_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
+        args.a_transpose = ab_transpose;                                  // valid BLAS option
+        args.b_transpose = ab_transpose;
+
+        // Creates the arguments vector for the regular tests; buffer sizes below 1 are skipped
+        auto regular_test_vector = std::vector<Arguments<U>>{};
+        for (auto &n: tester.kMatrixDims) { args.n = n;
+          for (auto &k: tester.kMatrixDims) { args.k = k;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                            args.a_size = TestXher2k<T,U>::GetSizeA(args);
+                            args.b_size = TestXher2k<T,U>::GetSizeB(args);
+                            args.c_size = TestXher2k<T,U>::GetSizeC(args);
+                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                            regular_test_vector.push_back(args);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Creates the arguments vector for the invalid-buffer tests: dims fixed, sizes varied
+        auto invalid_test_vector = std::vector<Arguments<U>>{};
+        args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }
+
+        // Runs both sets of tests under a name describing this layout/triangle/transpose case
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace): only the first run passes silent=false
+int main(int argc, char *argv[]) {
+  clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHER2K");
+  clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHER2K");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xherk.cc b/test/correctness/routines/level3/xherk.cc
new file mode 100644
index 00000000..7a4a7278
--- /dev/null
+++ b/test/correctness/routines/level3/xherk.cc
@@ -0,0 +1,92 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xherk routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xherk.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester: assembles the Xherk test-cases and runs them through TestBlas
+template <typename T, typename U>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester, handing it the static TestXherk<T,U> members for this routine
+  TestBlas<T,U> tester{argc, argv, silent, name, TestXherk<T,U>::GetOptions(),
+                       TestXherk<T,U>::RunRoutine, TestXherk<T,U>::RunReference,
+                       TestXherk<T,U>::DownloadResult, TestXherk<T,U>::GetResultIndex,
+                       TestXherk<T,U>::ResultID1, TestXherk<T,U>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine; it is reused across all cases
+  auto args = Arguments<U>{};
+
+  // Loops over the test-cases from a data-layout point of view
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+      for (auto &a_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
+        args.a_transpose = a_transpose;                                  // valid BLAS option
+
+        // Creates the arguments vector for the regular tests; buffer sizes below 1 are skipped
+        auto regular_test_vector = std::vector<Arguments<U>>{};
+        for (auto &n: tester.kMatrixDims) { args.n = n;
+          for (auto &k: tester.kMatrixDims) { args.k = k;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                  for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                    for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                      for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                        args.a_size = TestXherk<T,U>::GetSizeA(args);
+                        args.c_size = TestXherk<T,U>::GetSizeC(args);
+                        if (args.a_size<1 || args.c_size<1) { continue; }
+                        regular_test_vector.push_back(args);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Creates the arguments vector for the invalid-buffer tests: dims fixed, sizes varied
+        auto invalid_test_vector = std::vector<Arguments<U>>{};
+        args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+            invalid_test_vector.push_back(args);
+          }
+        }
+
+        // Runs both sets of tests under a name describing this layout/triangle/transpose case
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace): only the first run passes silent=false
+int main(int argc, char *argv[]) {
+  clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHERK");
+  clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHERK");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xsymm.cc b/test/correctness/routines/level3/xsymm.cc
new file mode 100644
index 00000000..851efff2
--- /dev/null
+++ b/test/correctness/routines/level3/xsymm.cc
@@ -0,0 +1,100 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xsymm routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xsymm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester: assembles the Xsymm test-cases and runs them through TestBlas
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester, handing it the static TestXsymm<T> members for this routine
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXsymm<T>::GetOptions(),
+                       TestXsymm<T>::RunRoutine, TestXsymm<T>::RunReference,
+                       TestXsymm<T>::DownloadResult, TestXsymm<T>::GetResultIndex,
+                       TestXsymm<T>::ResultID1, TestXsymm<T>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine; it is reused across all cases
+  auto args = Arguments<T>{};
+
+  // Loops over the test-cases from a data-layout point of view
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &side: tester.kSides) { args.side = side;
+      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+
+        // Creates the arguments vector for the regular tests; buffer sizes below 1 are skipped
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &m: tester.kMatrixDims) { args.m = m;
+          for (auto &n: tester.kMatrixDims) { args.n = n;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                            args.a_size = TestXsymm<T>::GetSizeA(args);
+                            args.b_size = TestXsymm<T>::GetSizeB(args);
+                            args.c_size = TestXsymm<T>::GetSizeC(args);
+                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                            regular_test_vector.push_back(args);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Creates the arguments vector for the invalid-buffer tests: dims fixed, sizes varied
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.m = args.n = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }
+
+        // Runs both sets of tests under a name describing this layout/side/triangle case
+        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace): only the first run passes silent=false
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "SSYMM");
+  clblast::RunTest<double>(argc, argv, true, "DSYMM");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYMM");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYMM");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xsyr2k.cc b/test/correctness/routines/level3/xsyr2k.cc
new file mode 100644
index 00000000..61ea59a3
--- /dev/null
+++ b/test/correctness/routines/level3/xsyr2k.cc
@@ -0,0 +1,102 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xsyr2k routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xsyr2k.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester: assembles the Xsyr2k test-cases and runs them through TestBlas
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester, handing it the static TestXsyr2k<T> members for this routine
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXsyr2k<T>::GetOptions(),
+                       TestXsyr2k<T>::RunRoutine, TestXsyr2k<T>::RunReference,
+                       TestXsyr2k<T>::DownloadResult, TestXsyr2k<T>::GetResultIndex,
+                       TestXsyr2k<T>::ResultID1, TestXsyr2k<T>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine; it is reused across all cases
+  auto args = Arguments<T>{};
+
+  // Loops over the test-cases from a data-layout point of view
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+      for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
+        args.a_transpose = ab_transpose;                            // is not supported by clBLAS
+        args.b_transpose = ab_transpose;
+
+        // Creates the arguments vector for the regular tests; buffer sizes below 1 are skipped
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &n: tester.kMatrixDims) { args.n = n;
+          for (auto &k: tester.kMatrixDims) { args.k = k;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                            args.a_size = TestXsyr2k<T>::GetSizeA(args);
+                            args.b_size = TestXsyr2k<T>::GetSizeB(args);
+                            args.c_size = TestXsyr2k<T>::GetSizeC(args);
+                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
+                            regular_test_vector.push_back(args);
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Creates the arguments vector for the invalid-buffer tests: dims fixed, sizes varied
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.b_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+              invalid_test_vector.push_back(args);
+            }
+          }
+        }
+
+        // Runs both sets of tests under a name describing this layout/triangle/transpose case
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace): only the first run passes silent=false
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "SSYR2K");
+  clblast::RunTest<double>(argc, argv, true, "DSYR2K");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYR2K");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYR2K");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xsyrk.cc b/test/correctness/routines/level3/xsyrk.cc
new file mode 100644
index 00000000..126e201b
--- /dev/null
+++ b/test/correctness/routines/level3/xsyrk.cc
@@ -0,0 +1,94 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xsyrk routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xsyrk.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester: assembles the Xsyrk test-cases and runs them through TestBlas
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester, handing it the static TestXsyrk<T> members for this routine
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXsyrk<T>::GetOptions(),
+                       TestXsyrk<T>::RunRoutine, TestXsyrk<T>::RunReference,
+                       TestXsyrk<T>::DownloadResult, TestXsyrk<T>::GetResultIndex,
+                       TestXsyrk<T>::ResultID1, TestXsyrk<T>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine; it is reused across all cases
+  auto args = Arguments<T>{};
+
+  // Loops over the test-cases from a data-layout point of view
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+      for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
+        args.a_transpose = a_transpose;                            // is not supported by clBLAS
+
+        // Creates the arguments vector for the regular tests; buffer sizes below 1 are skipped
+        auto regular_test_vector = std::vector<Arguments<T>>{};
+        for (auto &n: tester.kMatrixDims) { args.n = n;
+          for (auto &k: tester.kMatrixDims) { args.k = k;
+            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
+                  for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
+                    for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                      for (auto &beta: tester.kBetaValues) { args.beta = beta;
+                        args.a_size = TestXsyrk<T>::GetSizeA(args);
+                        args.c_size = TestXsyrk<T>::GetSizeC(args);
+                        if (args.a_size<1 || args.c_size<1) { continue; }
+                        regular_test_vector.push_back(args);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Creates the arguments vector for the invalid-buffer tests: dims fixed, sizes varied
+        auto invalid_test_vector = std::vector<Arguments<T>>{};
+        args.n = args.k = tester.kBufferSize;
+        args.a_ld = args.c_ld = tester.kBufferSize;
+        args.a_offset = args.c_offset = 0;
+        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+          for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
+            invalid_test_vector.push_back(args);
+          }
+        }
+
+        // Runs both sets of tests under a name describing this layout/triangle/transpose case
+        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
+        tester.TestRegular(regular_test_vector, case_name);
+        tester.TestInvalid(invalid_test_vector, case_name);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace): only the first run passes silent=false
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "SSYRK");
+  clblast::RunTest<double>(argc, argv, true, "DSYRK");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYRK");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYRK");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xtrmm.cc b/test/correctness/routines/level3/xtrmm.cc
new file mode 100644
index 00000000..5f04bb18
--- /dev/null
+++ b/test/correctness/routines/level3/xtrmm.cc
@@ -0,0 +1,96 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the Xtrmm routine.
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/level3/xtrmm.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The correctness tester: assembles the Xtrmm test-cases and runs them through TestBlas
+template <typename T>
+void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
+
+  // Creates a tester, handing it the static TestXtrmm<T> members for this routine
+  TestBlas<T,T> tester{argc, argv, silent, name, TestXtrmm<T>::GetOptions(),
+                       TestXtrmm<T>::RunRoutine, TestXtrmm<T>::RunReference,
+                       TestXtrmm<T>::DownloadResult, TestXtrmm<T>::GetResultIndex,
+                       TestXtrmm<T>::ResultID1, TestXtrmm<T>::ResultID2};
+
+  // This variable holds the arguments relevant for this routine; it is reused across all cases
+  auto args = Arguments<T>{};
+
+  // Loops over the test-cases from a data-layout point of view
+  for (auto &layout: tester.kLayouts) { args.layout = layout;
+    for (auto &side: tester.kSides) { args.side = side;
+      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
+        for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
+          for (auto &diagonal: tester.kDiagonals) { args.diagonal = diagonal;
+
+            // Creates the arguments vector for the regular tests; buffer sizes below 1 are skipped
+            auto regular_test_vector = std::vector<Arguments<T>>{};
+            for (auto &m: tester.kMatrixDims) { args.m = m;
+              for (auto &n: tester.kMatrixDims) { args.n = n;
+                for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
+                  for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
+                    for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
+                      for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
+                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
+                          args.a_size = TestXtrmm<T>::GetSizeA(args);
+                          args.b_size = TestXtrmm<T>::GetSizeB(args);
+                          if (args.a_size<1 || args.b_size<1) { continue; }
+                          regular_test_vector.push_back(args);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+
+            // Creates the arguments vector for the invalid-buffer tests: dims fixed, sizes varied
+            auto invalid_test_vector = std::vector<Arguments<T>>{};
+            args.m = args.n = tester.kBufferSize;
+            args.a_ld = args.b_ld = tester.kBufferSize;
+            args.a_offset = args.b_offset = 0;
+            for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
+              for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
+                invalid_test_vector.push_back(args);
+              }
+            }
+
+            // Runs both sets of tests, named after the current combination of options
+            const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle)+" "+
+                                   ToString(a_transpose)+" "+ToString(diagonal);
+            tester.TestRegular(regular_test_vector, case_name);
+            tester.TestInvalid(invalid_test_vector, case_name);
+          }
+        }
+      }
+    }
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace): only the first run passes silent=false
+int main(int argc, char *argv[]) {
+  clblast::RunTest<float>(argc, argv, false, "STRMM");
+  clblast::RunTest<double>(argc, argv, true, "DTRMM");
+  clblast::RunTest<clblast::float2>(argc, argv, true, "CTRMM");
+  clblast::RunTest<clblast::double2>(argc, argv, true, "ZTRMM");
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/xaxpy.cc b/test/correctness/routines/xaxpy.cc
deleted file mode 100644
index cf23ca9f..00000000
--- a/test/correctness/routines/xaxpy.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xaxpy routine.
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/xaxpy.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester
-template <typename T>
-void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates a tester
-  TestBlas<T,T> tester{argc, argv, silent, name, TestXaxpy<T>::GetOptions(),
-                       TestXaxpy<T>::RunRoutine, TestXaxpy<T>::RunReference,
-                       TestXaxpy<T>::DownloadResult, TestXaxpy<T>::GetResultIndex,
-                       TestXaxpy<T>::ResultID1, TestXaxpy<T>::ResultID2};
-
-  // This variable holds the arguments relevant for this routine
-  auto args = Arguments<T>{};
-
-  // Creates the arguments vector for the regular tests
-  auto regular_test_vector = std::vector<Arguments<T>>{};
-  for (auto &n: tester.kVectorDims) { args.n = n;
-    for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
-      for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
-        for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
-          for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
-            for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
-              args.x_size = TestXaxpy<T>::GetSizeX(args);
-              args.y_size = TestXaxpy<T>::GetSizeY(args);
-              if (args.x_size<1 || args.y_size<1) { continue; }
-              regular_test_vector.push_back(args);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  // Creates the arguments vector for the invalid-buffer tests
-  auto invalid_test_vector = std::vector<Arguments<T>>{};
-  args.n = tester.kBufferSize;
-  args.x_inc = args.y_inc = 1;
-  args.x_offset = args.y_offset = 0;
-  for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
-    for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
-      invalid_test_vector.push_back(args);
-    }
-  }
-
-  // Runs the tests
-  const auto case_name = "default";
-  tester.TestRegular(regular_test_vector, case_name);
-  tester.TestInvalid(invalid_test_vector, case_name);
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTest<float>(argc, argv, false, "SAXPY");
-  clblast::RunTest<double>(argc, argv, true, "DAXPY");
-  clblast::RunTest<clblast::float2>(argc, argv, true, "CAXPY");
-  clblast::RunTest<clblast::double2>(argc, argv, true, "ZAXPY");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xgemm.cc b/test/correctness/routines/xgemm.cc
deleted file mode 100644
index 8a50e1ca..00000000
--- a/test/correctness/routines/xgemm.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xgemm routine.
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/xgemm.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester
-template <typename T>
-void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates a tester
-  TestBlas<T,T> tester{argc, argv, silent, name, TestXgemm<T>::GetOptions(),
-                       TestXgemm<T>::RunRoutine, TestXgemm<T>::RunReference,
-                       TestXgemm<T>::DownloadResult, TestXgemm<T>::GetResultIndex,
-                       TestXgemm<T>::ResultID1, TestXgemm<T>::ResultID2};
-
-  // This variable holds the arguments relevant for this routine
-  auto args = Arguments<T>{};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) { args.layout = layout;
-    for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
-      for (auto &b_transpose: tester.kTransposes) { args.b_transpose = b_transpose;
-
-        // Creates the arguments vector for the regular tests
-        auto regular_test_vector = std::vector<Arguments<T>>{};
-        for (auto &m: tester.kMatrixDims) { args.m = m;
-          for (auto &n: tester.kMatrixDims) { args.n = n;
-            for (auto &k: tester.kMatrixDims) { args.k = k;
-              for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
-                for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
-                  for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
-                    for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
-                      for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
-                        for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
-                          for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
-                            for (auto &beta: tester.kBetaValues) { args.beta = beta;
-                              args.a_size = TestXgemm<T>::GetSizeA(args);
-                              args.b_size = TestXgemm<T>::GetSizeB(args);
-                              args.c_size = TestXgemm<T>::GetSizeC(args);
-                              if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
-                              regular_test_vector.push_back(args);
-                            }
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-
-        // Creates the arguments vector for the invalid-buffer tests
-        auto invalid_test_vector = std::vector<Arguments<T>>{};
-        args.m = args.n = args.k = tester.kBufferSize;
-        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
-        args.a_offset = args.b_offset = args.c_offset = 0;
-        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
-          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
-            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
-              invalid_test_vector.push_back(args);
-            }
-          }
-        }
-
-        // Runs the tests
-        const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
-        tester.TestRegular(regular_test_vector, case_name);
-        tester.TestInvalid(invalid_test_vector, case_name);
-      }
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTest<float>(argc, argv, false, "SGEMM");
-  clblast::RunTest<double>(argc, argv, true, "DGEMM");
-  clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMM");
-  clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xgemv.cc b/test/correctness/routines/xgemv.cc
deleted file mode 100644
index 50ce4699..00000000
--- a/test/correctness/routines/xgemv.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xgemv routine.
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/xgemv.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester
-template <typename T>
-void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates a tester
-  TestBlas<T,T> tester{argc, argv, silent, name, TestXgemv<T>::GetOptions(),
-                       TestXgemv<T>::RunRoutine, TestXgemv<T>::RunReference,
-                       TestXgemv<T>::DownloadResult, TestXgemv<T>::GetResultIndex,
-                       TestXgemv<T>::ResultID1, TestXgemv<T>::ResultID2};
-
-  // This variable holds the arguments relevant for this routine
-  auto args = Arguments<T>{};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) { args.layout = layout;
-    for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
-
-      // Creates the arguments vector for the regular tests
-      auto regular_test_vector = std::vector<Arguments<T>>{};
-      for (auto &m: tester.kMatrixVectorDims) { args.m = m;
-        for (auto &n: tester.kMatrixVectorDims) { args.n = n;
-          for (auto &a_ld: tester.kMatrixVectorDims) { args.a_ld = a_ld;
-            for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
-              for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
-                for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
-                  for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
-                    for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
-                      for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
-                        for (auto &beta: tester.kBetaValues) { args.beta = beta;
-                          args.a_size = TestXgemv<T>::GetSizeA(args);
-                          args.x_size = TestXgemv<T>::GetSizeX(args);
-                          args.y_size = TestXgemv<T>::GetSizeY(args);
-                          if (args.a_size<1 || args.x_size<1 || args.y_size<1) { continue; }
-                          regular_test_vector.push_back(args);
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-
-      // Creates the arguments vector for the invalid-buffer tests
-      auto invalid_test_vector = std::vector<Arguments<T>>{};
-      args.m = args.n = tester.kBufferSize;
-      args.a_ld = tester.kBufferSize;
-      args.x_inc = args.y_inc = 1;
-      args.a_offset = args.x_offset = args.y_offset = 0;
-      for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
-        for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
-          for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
-            invalid_test_vector.push_back(args);
-          }
-        }
-      }
-
-      // Runs the tests
-      const auto case_name = ToString(layout)+" "+ToString(a_transpose);
-      tester.TestRegular(regular_test_vector, case_name);
-      tester.TestInvalid(invalid_test_vector, case_name);
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTest<float>(argc, argv, false, "SGEMV");
-  clblast::RunTest<double>(argc, argv, true, "DGEMV");
-  clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMV");
-  clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xhemm.cc b/test/correctness/routines/xhemm.cc
deleted file mode 100644
index e8c82f65..00000000
--- a/test/correctness/routines/xhemm.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xhemm routine.
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/xhemm.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester
-template <typename T>
-void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates a tester
-  TestBlas<T,T> tester{argc, argv, silent, name, TestXhemm<T>::GetOptions(),
-                       TestXhemm<T>::RunRoutine, TestXhemm<T>::RunReference,
-                       TestXhemm<T>::DownloadResult, TestXhemm<T>::GetResultIndex,
-                       TestXhemm<T>::ResultID1, TestXhemm<T>::ResultID2};
-
-  // This variable holds the arguments relevant for this routine
-  auto args = Arguments<T>{};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) { args.layout = layout;
-    for (auto &side: tester.kSides) { args.side = side;
-      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
-
-        // Creates the arguments vector for the regular tests
-        auto regular_test_vector = std::vector<Arguments<T>>{};
-        for (auto &m: tester.kMatrixDims) { args.m = m;
-          for (auto &n: tester.kMatrixDims) { args.n = n;
-            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
-              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
-                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
-                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
-                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
-                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
-                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
-                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
-                            args.a_size = TestXhemm<T>::GetSizeA(args);
-                            args.b_size = TestXhemm<T>::GetSizeB(args);
-                            args.c_size = TestXhemm<T>::GetSizeC(args);
-                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
-                            regular_test_vector.push_back(args);
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-
-        // Creates the arguments vector for the invalid-buffer tests
-        auto invalid_test_vector = std::vector<Arguments<T>>{};
-        args.m = args.n = tester.kBufferSize;
-        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
-        args.a_offset = args.b_offset = args.c_offset = 0;
-        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
-          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
-            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
-              invalid_test_vector.push_back(args);
-            }
-          }
-        }
-
-        // Runs the tests
-        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
-        tester.TestRegular(regular_test_vector, case_name);
-        tester.TestInvalid(invalid_test_vector, case_name);
-      }
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTest<clblast::float2>(argc, argv, true, "CHEMM");
-  clblast::RunTest<clblast::double2>(argc, argv, true, "ZHEMM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xher2k.cc b/test/correctness/routines/xher2k.cc
deleted file mode 100644
index 7c0e5a92..00000000
--- a/test/correctness/routines/xher2k.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xher2k routine.
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/xher2k.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester
-template <typename T, typename U>
-void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates a tester
-  TestBlas<T,U> tester{argc, argv, silent, name, TestXher2k<T,U>::GetOptions(),
-                       TestXher2k<T,U>::RunRoutine, TestXher2k<T,U>::RunReference,
-                       TestXher2k<T,U>::DownloadResult, TestXher2k<T,U>::GetResultIndex,
-                       TestXher2k<T,U>::ResultID1, TestXher2k<T,U>::ResultID2};
-
-  // This variable holds the arguments relevant for this routine
-  auto args = Arguments<U>{};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) { args.layout = layout;
-    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
-      for (auto &ab_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
-        args.a_transpose = ab_transpose;                                  // valid BLAS option
-        args.b_transpose = ab_transpose;
-
-        // Creates the arguments vector for the regular tests
-        auto regular_test_vector = std::vector<Arguments<U>>{};
-        for (auto &n: tester.kMatrixDims) { args.n = n;
-          for (auto &k: tester.kMatrixDims) { args.k = k;
-            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
-              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
-                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
-                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
-                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
-                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
-                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
-                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
-                            args.a_size = TestXher2k<T,U>::GetSizeA(args);
-                            args.b_size = TestXher2k<T,U>::GetSizeB(args);
-                            args.c_size = TestXher2k<T,U>::GetSizeC(args);
-                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
-                            regular_test_vector.push_back(args);
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-
-        // Creates the arguments vector for the invalid-buffer tests
-        auto invalid_test_vector = std::vector<Arguments<U>>{};
-        args.n = args.k = tester.kBufferSize;
-        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
-        args.a_offset = args.b_offset = args.c_offset = 0;
-        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
-          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
-            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
-              invalid_test_vector.push_back(args);
-            }
-          }
-        }
-
-        // Runs the tests
-        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
-        tester.TestRegular(regular_test_vector, case_name);
-        tester.TestInvalid(invalid_test_vector, case_name);
-      }
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHER2K");
-  clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHER2K");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xherk.cc b/test/correctness/routines/xherk.cc
deleted file mode 100644
index dc5c6caf..00000000
--- a/test/correctness/routines/xherk.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xherk routine.
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/xherk.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester
-template <typename T, typename U>
-void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates a tester
-  TestBlas<T,U> tester{argc, argv, silent, name, TestXherk<T,U>::GetOptions(),
-                       TestXherk<T,U>::RunRoutine, TestXherk<T,U>::RunReference,
-                       TestXherk<T,U>::DownloadResult, TestXherk<T,U>::GetResultIndex,
-                       TestXherk<T,U>::ResultID1, TestXherk<T,U>::ResultID2};
-
-  // This variable holds the arguments relevant for this routine
-  auto args = Arguments<U>{};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) { args.layout = layout;
-    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
-      for (auto &a_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
-        args.a_transpose = a_transpose;                                  // valid BLAS option
-
-        // Creates the arguments vector for the regular tests
-        auto regular_test_vector = std::vector<Arguments<U>>{};
-        for (auto &n: tester.kMatrixDims) { args.n = n;
-          for (auto &k: tester.kMatrixDims) { args.k = k;
-            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
-              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
-                for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
-                  for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
-                    for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
-                      for (auto &beta: tester.kBetaValues) { args.beta = beta;
-                        args.a_size = TestXherk<T,U>::GetSizeA(args);
-                        args.c_size = TestXherk<T,U>::GetSizeC(args);
-                        if (args.a_size<1 || args.c_size<1) { continue; }
-                        regular_test_vector.push_back(args);
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-
-        // Creates the arguments vector for the invalid-buffer tests
-        auto invalid_test_vector = std::vector<Arguments<U>>{};
-        args.n = args.k = tester.kBufferSize;
-        args.a_ld = args.c_ld = tester.kBufferSize;
-        args.a_offset = args.c_offset = 0;
-        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
-          for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
-            invalid_test_vector.push_back(args);
-          }
-        }
-
-        // Runs the tests
-        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
-        tester.TestRegular(regular_test_vector, case_name);
-        tester.TestInvalid(invalid_test_vector, case_name);
-      }
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHERK");
-  clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHERK");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xsymm.cc b/test/correctness/routines/xsymm.cc
deleted file mode 100644
index a919a056..00000000
--- a/test/correctness/routines/xsymm.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xsymm routine.
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/xsymm.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester
-template <typename T>
-void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates a tester
-  TestBlas<T,T> tester{argc, argv, silent, name, TestXsymm<T>::GetOptions(),
-                       TestXsymm<T>::RunRoutine, TestXsymm<T>::RunReference,
-                       TestXsymm<T>::DownloadResult, TestXsymm<T>::GetResultIndex,
-                       TestXsymm<T>::ResultID1, TestXsymm<T>::ResultID2};
-
-  // This variable holds the arguments relevant for this routine
-  auto args = Arguments<T>{};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) { args.layout = layout;
-    for (auto &side: tester.kSides) { args.side = side;
-      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
-
-        // Creates the arguments vector for the regular tests
-        auto regular_test_vector = std::vector<Arguments<T>>{};
-        for (auto &m: tester.kMatrixDims) { args.m = m;
-          for (auto &n: tester.kMatrixDims) { args.n = n;
-            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
-              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
-                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
-                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
-                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
-                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
-                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
-                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
-                            args.a_size = TestXsymm<T>::GetSizeA(args);
-                            args.b_size = TestXsymm<T>::GetSizeB(args);
-                            args.c_size = TestXsymm<T>::GetSizeC(args);
-                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
-                            regular_test_vector.push_back(args);
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-
-        // Creates the arguments vector for the invalid-buffer tests
-        auto invalid_test_vector = std::vector<Arguments<T>>{};
-        args.m = args.n = tester.kBufferSize;
-        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
-        args.a_offset = args.b_offset = args.c_offset = 0;
-        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
-          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
-            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
-              invalid_test_vector.push_back(args);
-            }
-          }
-        }
-
-        // Runs the tests
-        const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
-        tester.TestRegular(regular_test_vector, case_name);
-        tester.TestInvalid(invalid_test_vector, case_name);
-      }
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTest<float>(argc, argv, false, "SSYMM");
-  clblast::RunTest<double>(argc, argv, true, "DSYMM");
-  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYMM");
-  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYMM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xsyr2k.cc b/test/correctness/routines/xsyr2k.cc
deleted file mode 100644
index 736aa4e5..00000000
--- a/test/correctness/routines/xsyr2k.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xsyr2k routine.
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/xsyr2k.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester
-template <typename T>
-void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates a tester
-  TestBlas<T,T> tester{argc, argv, silent, name, TestXsyr2k<T>::GetOptions(),
-                       TestXsyr2k<T>::RunRoutine, TestXsyr2k<T>::RunReference,
-                       TestXsyr2k<T>::DownloadResult, TestXsyr2k<T>::GetResultIndex,
-                       TestXsyr2k<T>::ResultID1, TestXsyr2k<T>::ResultID2};
-
-  // This variable holds the arguments relevant for this routine
-  auto args = Arguments<T>{};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) { args.layout = layout;
-    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
-      for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
-        args.a_transpose = ab_transpose;                            // is not supported by clBLAS
-        args.b_transpose = ab_transpose;
-
-        // Creates the arguments vector for the regular tests
-        auto regular_test_vector = std::vector<Arguments<T>>{};
-        for (auto &n: tester.kMatrixDims) { args.n = n;
-          for (auto &k: tester.kMatrixDims) { args.k = k;
-            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
-              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
-                for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
-                  for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
-                    for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
-                      for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
-                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
-                          for (auto &beta: tester.kBetaValues) { args.beta = beta;
-                            args.a_size = TestXsyr2k<T>::GetSizeA(args);
-                            args.b_size = TestXsyr2k<T>::GetSizeB(args);
-                            args.c_size = TestXsyr2k<T>::GetSizeC(args);
-                            if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
-                            regular_test_vector.push_back(args);
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-
-        // Creates the arguments vector for the invalid-buffer tests
-        auto invalid_test_vector = std::vector<Arguments<T>>{};
-        args.n = args.k = tester.kBufferSize;
-        args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
-        args.a_offset = args.b_offset = args.c_offset = 0;
-        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
-          for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
-            for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
-              invalid_test_vector.push_back(args);
-            }
-          }
-        }
-
-        // Runs the tests
-        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
-        tester.TestRegular(regular_test_vector, case_name);
-        tester.TestInvalid(invalid_test_vector, case_name);
-      }
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTest<float>(argc, argv, false, "SSYR2K");
-  clblast::RunTest<double>(argc, argv, true, "DSYR2K");
-  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYR2K");
-  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYR2K");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xsyrk.cc b/test/correctness/routines/xsyrk.cc
deleted file mode 100644
index a62a0ebf..00000000
--- a/test/correctness/routines/xsyrk.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xsyrk routine.
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/xsyrk.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester
-template <typename T>
-void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates a tester
-  TestBlas<T,T> tester{argc, argv, silent, name, TestXsyrk<T>::GetOptions(),
-                       TestXsyrk<T>::RunRoutine, TestXsyrk<T>::RunReference,
-                       TestXsyrk<T>::DownloadResult, TestXsyrk<T>::GetResultIndex,
-                       TestXsyrk<T>::ResultID1, TestXsyrk<T>::ResultID2};
-
-  // This variable holds the arguments relevant for this routine
-  auto args = Arguments<T>{};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) { args.layout = layout;
-    for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
-      for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
-        args.a_transpose = a_transpose;                            // is not supported by clBLAS
-
-        // Creates the arguments vector for the regular tests
-        auto regular_test_vector = std::vector<Arguments<T>>{};
-        for (auto &n: tester.kMatrixDims) { args.n = n;
-          for (auto &k: tester.kMatrixDims) { args.k = k;
-            for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
-              for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
-                for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
-                  for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
-                    for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
-                      for (auto &beta: tester.kBetaValues) { args.beta = beta;
-                        args.a_size = TestXsyrk<T>::GetSizeA(args);
-                        args.c_size = TestXsyrk<T>::GetSizeC(args);
-                        if (args.a_size<1 || args.c_size<1) { continue; }
-                        regular_test_vector.push_back(args);
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-
-        // Creates the arguments vector for the invalid-buffer tests
-        auto invalid_test_vector = std::vector<Arguments<T>>{};
-        args.n = args.k = tester.kBufferSize;
-        args.a_ld = args.c_ld = tester.kBufferSize;
-        args.a_offset = args.c_offset = 0;
-        for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
-          for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
-            invalid_test_vector.push_back(args);
-          }
-        }
-
-        // Runs the tests
-        const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
-        tester.TestRegular(regular_test_vector, case_name);
-        tester.TestInvalid(invalid_test_vector, case_name);
-      }
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTest<float>(argc, argv, false, "SSYRK");
-  clblast::RunTest<double>(argc, argv, true, "DSYRK");
-  clblast::RunTest<clblast::float2>(argc, argv, true, "CSYRK");
-  clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYRK");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/xtrmm.cc b/test/correctness/routines/xtrmm.cc
deleted file mode 100644
index 0bb6294c..00000000
--- a/test/correctness/routines/xtrmm.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the tests for the Xtrmm routine.
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/xtrmm.h"
-
-namespace clblast {
-// =================================================================================================
-
-// The correctness tester
-template <typename T>
-void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
-
-  // Creates a tester
-  TestBlas<T,T> tester{argc, argv, silent, name, TestXtrmm<T>::GetOptions(),
-                       TestXtrmm<T>::RunRoutine, TestXtrmm<T>::RunReference,
-                       TestXtrmm<T>::DownloadResult, TestXtrmm<T>::GetResultIndex,
-                       TestXtrmm<T>::ResultID1, TestXtrmm<T>::ResultID2};
-
-  // This variable holds the arguments relevant for this routine
-  auto args = Arguments<T>{};
-
-  // Loops over the test-cases from a data-layout point of view
-  for (auto &layout: tester.kLayouts) { args.layout = layout;
-    for (auto &side: tester.kSides) { args.side = side;
-      for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
-        for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
-          for (auto &diagonal: tester.kDiagonals) { args.diagonal = diagonal;
-
-            // Creates the arguments vector for the regular tests
-            auto regular_test_vector = std::vector<Arguments<T>>{};
-            for (auto &m: tester.kMatrixDims) { args.m = m;
-              for (auto &n: tester.kMatrixDims) { args.n = n;
-                for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
-                  for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
-                    for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
-                      for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
-                        for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
-                          args.a_size = TestXtrmm<T>::GetSizeA(args);
-                          args.b_size = TestXtrmm<T>::GetSizeB(args);
-                          if (args.a_size<1 || args.b_size<1) { continue; }
-                          regular_test_vector.push_back(args);
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-
-            // Creates the arguments vector for the invalid-buffer tests
-            auto invalid_test_vector = std::vector<Arguments<T>>{};
-            args.m = args.n = tester.kBufferSize;
-            args.a_ld = args.b_ld = tester.kBufferSize;
-            args.a_offset = args.b_offset = 0;
-            for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
-              for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
-                invalid_test_vector.push_back(args);
-              }
-            }
-
-            // Runs the tests
-            const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle)+" "+
-                                   ToString(a_transpose)+" "+ToString(diagonal);
-            tester.TestRegular(regular_test_vector, case_name);
-            tester.TestInvalid(invalid_test_vector, case_name);
-          }
-        }
-      }
-    }
-  }
-}
-
-// =================================================================================================
-} // namespace clblast
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTest<float>(argc, argv, false, "STRMM");
-  clblast::RunTest<double>(argc, argv, true, "DTRMM");
-  clblast::RunTest<clblast::float2>(argc, argv, true, "CTRMM");
-  clblast::RunTest<clblast::double2>(argc, argv, true, "ZTRMM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/level1/xaxpy.cc b/test/performance/routines/level1/xaxpy.cc
new file mode 100644
index 00000000..fe90c697
--- /dev/null
+++ b/test/performance/routines/level1/xaxpy.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level1/xaxpy.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXaxpy<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXaxpy<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXaxpy<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level2/xgemv.cc b/test/performance/routines/level2/xgemv.cc
new file mode 100644
index 00000000..376c6c33
--- /dev/null
+++ b/test/performance/routines/level2/xgemv.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemv command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level2/xgemv.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXgemv<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXgemv<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXgemv<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xgemm.cc b/test/performance/routines/level3/xgemm.cc
new file mode 100644
index 00000000..c45c238f
--- /dev/null
+++ b/test/performance/routines/level3/xgemm.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xgemm.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXgemm<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXgemm<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXgemm<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXgemm<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xhemm.cc b/test/performance/routines/level3/xhemm.cc
new file mode 100644
index 00000000..d215653b
--- /dev/null
+++ b/test/performance/routines/level3/xhemm.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemm command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xhemm.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kDouble:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXhemm<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXhemm<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xher2k.cc b/test/performance/routines/level3/xher2k.cc
new file mode 100644
index 00000000..2e1f248a
--- /dev/null
+++ b/test/performance/routines/level3/xher2k.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2k command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xher2k.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kDouble:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXher2k<float2,float>, float2, float>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXher2k<double2,double>, double2, double>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xherk.cc b/test/performance/routines/level3/xherk.cc
new file mode 100644
index 00000000..4386f78c
--- /dev/null
+++ b/test/performance/routines/level3/xherk.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xherk command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xherk.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kDouble:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXherk<float2,float>, float2, float>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXherk<double2,double>, double2, double>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xsymm.cc b/test/performance/routines/level3/xsymm.cc
new file mode 100644
index 00000000..bd014cee
--- /dev/null
+++ b/test/performance/routines/level3/xsymm.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xsymm.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXsymm<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXsymm<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXsymm<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXsymm<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xsyr2k.cc b/test/performance/routines/level3/xsyr2k.cc
new file mode 100644
index 00000000..1261be88
--- /dev/null
+++ b/test/performance/routines/level3/xsyr2k.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xsyr2k.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXsyr2k<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXsyr2k<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXsyr2k<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXsyr2k<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xsyrk.cc b/test/performance/routines/level3/xsyrk.cc
new file mode 100644
index 00000000..5799130f
--- /dev/null
+++ b/test/performance/routines/level3/xsyrk.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xsyrk.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXsyrk<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXsyrk<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXsyrk<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXsyrk<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/level3/xtrmm.cc b/test/performance/routines/level3/xtrmm.cc
new file mode 100644
index 00000000..c30866e9
--- /dev/null
+++ b/test/performance/routines/level3/xtrmm.cc
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmm command-line interface performance tester.
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xtrmm.h"
+
+// =================================================================================================
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf:
+      throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXtrmm<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXtrmm<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXtrmm<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXtrmm<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/xaxpy.cc b/test/performance/routines/xaxpy.cc
deleted file mode 100644
index 6a2b96c1..00000000
--- a/test/performance/routines/xaxpy.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xaxpy command-line interface performance tester.
-//
-// =================================================================================================
-
-#include "performance/client.h"
-#include "routines/xaxpy.h"
-
-// =================================================================================================
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kSingle:
-      clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break;
-    case clblast::Precision::kDouble:
-      clblast::RunClient<clblast::TestXaxpy<double>, double, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle:
-      clblast::RunClient<clblast::TestXaxpy<float2>, float2, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble:
-      clblast::RunClient<clblast::TestXaxpy<double2>, double2, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xgemm.cc b/test/performance/routines/xgemm.cc
deleted file mode 100644
index 9a02e595..00000000
--- a/test/performance/routines/xgemm.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemm command-line interface performance tester.
-//
-// =================================================================================================
-
-#include "performance/client.h"
-#include "routines/xgemm.h"
-
-// =================================================================================================
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kSingle:
-      clblast::RunClient<clblast::TestXgemm<float>, float, float>(argc, argv); break;
-    case clblast::Precision::kDouble:
-      clblast::RunClient<clblast::TestXgemm<double>, double, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle:
-      clblast::RunClient<clblast::TestXgemm<float2>, float2, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble:
-      clblast::RunClient<clblast::TestXgemm<double2>, double2, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xgemv.cc b/test/performance/routines/xgemv.cc
deleted file mode 100644
index 6f69ef21..00000000
--- a/test/performance/routines/xgemv.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemv command-line interface performance tester.
-//
-// =================================================================================================
-
-#include "performance/client.h"
-#include "routines/xgemv.h"
-
-// =================================================================================================
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kSingle:
-      clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break;
-    case clblast::Precision::kDouble:
-      clblast::RunClient<clblast::TestXgemv<double>, double, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle:
-      clblast::RunClient<clblast::TestXgemv<float2>, float2, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble:
-      clblast::RunClient<clblast::TestXgemv<double2>, double2, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xhemm.cc b/test/performance/routines/xhemm.cc
deleted file mode 100644
index 34798d8d..00000000
--- a/test/performance/routines/xhemm.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xhemm command-line interface performance tester.
-//
-// =================================================================================================
-
-#include "performance/client.h"
-#include "routines/xhemm.h"
-
-// =================================================================================================
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kSingle:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kDouble:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kComplexSingle:
-      clblast::RunClient<clblast::TestXhemm<float2>, float2, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble:
-      clblast::RunClient<clblast::TestXhemm<double2>, double2, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xher2k.cc b/test/performance/routines/xher2k.cc
deleted file mode 100644
index 1b505737..00000000
--- a/test/performance/routines/xher2k.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xher2k command-line interface performance tester.
-//
-// =================================================================================================
-
-#include "performance/client.h"
-#include "routines/xher2k.h"
-
-// =================================================================================================
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kSingle:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kDouble:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kComplexSingle:
-      clblast::RunClient<clblast::TestXher2k<float2,float>, float2, float>(argc, argv); break;
-    case clblast::Precision::kComplexDouble:
-      clblast::RunClient<clblast::TestXher2k<double2,double>, double2, double>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xherk.cc b/test/performance/routines/xherk.cc
deleted file mode 100644
index ce18152e..00000000
--- a/test/performance/routines/xherk.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xherk command-line interface performance tester.
-//
-// =================================================================================================
-
-#include "performance/client.h"
-#include "routines/xherk.h"
-
-// =================================================================================================
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kSingle:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kDouble:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kComplexSingle:
-      clblast::RunClient<clblast::TestXherk<float2,float>, float2, float>(argc, argv); break;
-    case clblast::Precision::kComplexDouble:
-      clblast::RunClient<clblast::TestXherk<double2,double>, double2, double>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xsymm.cc b/test/performance/routines/xsymm.cc
deleted file mode 100644
index 8738ceda..00000000
--- a/test/performance/routines/xsymm.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsymm command-line interface performance tester.
-//
-// =================================================================================================
-
-#include "performance/client.h"
-#include "routines/xsymm.h"
-
-// =================================================================================================
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kSingle:
-      clblast::RunClient<clblast::TestXsymm<float>, float, float>(argc, argv); break;
-    case clblast::Precision::kDouble:
-      clblast::RunClient<clblast::TestXsymm<double>, double, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle:
-      clblast::RunClient<clblast::TestXsymm<float2>, float2, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble:
-      clblast::RunClient<clblast::TestXsymm<double2>, double2, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xsyr2k.cc b/test/performance/routines/xsyr2k.cc
deleted file mode 100644
index e4c76229..00000000
--- a/test/performance/routines/xsyr2k.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyr2k command-line interface performance tester.
-//
-// =================================================================================================
-
-#include "performance/client.h"
-#include "routines/xsyr2k.h"
-
-// =================================================================================================
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kSingle:
-      clblast::RunClient<clblast::TestXsyr2k<float>, float, float>(argc, argv); break;
-    case clblast::Precision::kDouble:
-      clblast::RunClient<clblast::TestXsyr2k<double>, double, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle:
-      clblast::RunClient<clblast::TestXsyr2k<float2>, float2, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble:
-      clblast::RunClient<clblast::TestXsyr2k<double2>, double2, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xsyrk.cc b/test/performance/routines/xsyrk.cc
deleted file mode 100644
index 53fecb69..00000000
--- a/test/performance/routines/xsyrk.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyrk command-line interface performance tester.
-//
-// =================================================================================================
-
-#include "performance/client.h"
-#include "routines/xsyrk.h"
-
-// =================================================================================================
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kSingle:
-      clblast::RunClient<clblast::TestXsyrk<float>, float, float>(argc, argv); break;
-    case clblast::Precision::kDouble:
-      clblast::RunClient<clblast::TestXsyrk<double>, double, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle:
-      clblast::RunClient<clblast::TestXsyrk<float2>, float2, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble:
-      clblast::RunClient<clblast::TestXsyrk<double2>, double2, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/performance/routines/xtrmm.cc b/test/performance/routines/xtrmm.cc
deleted file mode 100644
index 2ab9ce77..00000000
--- a/test/performance/routines/xtrmm.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xtrmm command-line interface performance tester.
-//
-// =================================================================================================
-
-#include "performance/client.h"
-#include "routines/xtrmm.h"
-
-// =================================================================================================
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf:
-      throw std::runtime_error("Unsupported precision mode");
-    case clblast::Precision::kSingle:
-      clblast::RunClient<clblast::TestXtrmm<float>, float, float>(argc, argv); break;
-    case clblast::Precision::kDouble:
-      clblast::RunClient<clblast::TestXtrmm<double>, double, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle:
-      clblast::RunClient<clblast::TestXtrmm<float2>, float2, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble:
-      clblast::RunClient<clblast::TestXtrmm<double2>, double2, double2>(argc, argv); break;
-  }
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h
new file mode 100644
index 00000000..6ce5d7e2
--- /dev/null
+++ b/test/routines/level1/xaxpy.h
@@ -0,0 +1,113 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xaxpy routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XAXPY_H_
+#define CLBLAST_TEST_ROUTINES_XAXPY_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXaxpy {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgXInc, kArgYInc,
+            kArgXOffset, kArgYOffset,
+            kArgAlpha};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    return args.n * args.y_inc + args.y_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Axpy(args.n, args.alpha,
+                       buffers.x_vec(), args.x_offset, args.x_inc,
+                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXaxpy(args.n, args.alpha,
+                              buffers.x_vec(), args.x_offset, args.x_inc,
+                              buffers.y_vec(), args.y_offset, args.y_inc,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.y_inc + args.y_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (3 * args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XAXPY_H_
+#endif
diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h
new file mode 100644
index 00000000..73f7d76e
--- /dev/null
+++ b/test/routines/level2/xgemv.h
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xgemv routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XGEMV_H_
+#define CLBLAST_TEST_ROUTINES_XGEMV_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXgemv {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN,
+            kArgLayout, kArgATransp, 
+            kArgALeadDim, kArgXInc, kArgYInc,
+            kArgAOffset, kArgXOffset, kArgYOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kRowMajor);
+    auto a_two = (a_rotated) ? args.m : args.n;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeX(const Arguments<T> &args) {
+    auto a_transposed = (args.a_transpose != Transpose::kNo);
+    auto n_real = (a_transposed) ? args.m : args.n;
+    return n_real * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeY(const Arguments<T> &args) {
+    auto a_transposed = (args.a_transpose != Transpose::kNo);
+    auto m_real = (a_transposed) ? args.n : args.m;
+    return m_real * args.y_inc + args.y_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.x_size = GetSizeX(args);
+    args.y_size = GetSizeY(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Gemv(args.layout, args.a_transpose,
+                       args.m, args.n, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                       buffers.y_vec(), args.y_offset, args.y_inc,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              args.m, args.n, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
+                              buffers.y_vec(), args.y_offset, args.y_inc,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.y_size, static_cast<T>(0));
+    buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    auto a_transposed = (args.a_transpose != Transpose::kNo);
+    return (a_transposed) ? args.n : args.m;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.y_inc + args.y_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.m*args.n + 2*args.m + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XGEMV_H_
+#endif
diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h
new file mode 100644
index 00000000..86a304d1
--- /dev/null
+++ b/test/routines/level3/xgemm.h
@@ -0,0 +1,134 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xgemm routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XGEMM_H_
+#define CLBLAST_TEST_ROUTINES_XGEMM_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXgemm {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN, kArgK,
+            kArgLayout, kArgATransp, kArgBTransp,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto a_two = (a_rotated) ? args.m : args.k;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
+    auto b_two = (b_rotated) ? args.k : args.n;
+    return b_two * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    auto c_rotated = (args.layout == Layout::kRowMajor);
+    auto c_two = (c_rotated) ? args.m : args.n;
+    return c_two * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
+                       args.m, args.n, args.k, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              static_cast<clblasTranspose>(args.b_transpose),
+                              args.m, args.n, args.k, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                              buffers.c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (args.layout == Layout::kRowMajor) ?
+           id1*args.c_ld + id2 + args.c_offset:
+           id2*args.c_ld + id1 + args.c_offset;
+  }
+
+  // Performance metrics: 2*m*n*k flops; bytes cover A (m*k), B (k*n) and C read + written (2*m*n)
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.n * args.k;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XGEMM_H_
+#endif
diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h
new file mode 100644
index 00000000..75878b06
--- /dev/null
+++ b/test/routines/level3/xhemm.h
@@ -0,0 +1,134 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xhemm routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XHEMM_H_
+#define CLBLAST_TEST_ROUTINES_XHEMM_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXhemm {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN,
+            kArgLayout, kArgSide, kArgTriangle,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    // A is the square k-by-k operand (k = m when applied from the left, n from the right), so
+    // its second dimension is k in both the row-major and the column-major layout.
+    size_t k_value = (args.side == Side::kLeft) ? args.m : args.n;
+    return k_value * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    // B is m-by-n regardless of the side: m rows of b_ld elements in the row-major layout, or
+    // n columns of b_ld elements in the column-major layout.
+    auto b_rotated = (args.layout == Layout::kRowMajor);
+    return ((b_rotated) ? args.m : args.n) * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    auto c_rotated = (args.layout == Layout::kRowMajor);
+    auto c_two = (c_rotated) ? args.m : args.n;
+    return c_two * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Hemm(args.layout, args.side, args.triangle,
+                       args.m, args.n, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXhemm(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasSide>(args.side),
+                              static_cast<clblasUplo>(args.triangle),
+                              args.m, args.n, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                              buffers.c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (args.layout == Layout::kRowMajor) ?
+           id1*args.c_ld + id2 + args.c_offset:
+           id2*args.c_ld + id1 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.n * args.m;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XHEMM_H_
+#endif
diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h
new file mode 100644
index 00000000..f13e8a62
--- /dev/null
+++ b/test/routines/level3/xher2k.h
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xher2k routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XHER2K_H_
+#define CLBLAST_TEST_ROUTINES_XHER2K_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, typename U>
+class TestXher2k {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgK,
+            kArgLayout, kArgTriangle, kArgATransp,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<U> &args) {
+    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto a_two = (a_rotated) ? args.n : args.k;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<U> &args) {
+    // B uses A's transpose flag: the routine takes one transpose argument (args.a_transpose)
+    auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    return ((b_rotated) ? args.n : args.k) * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<U> &args) {
+    return args.n * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<U> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<U> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<U> &args) { return args.k; }
+  static size_t DefaultLDC(const Arguments<U> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<U> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto alpha2 = T{args.alpha, args.alpha};
+    auto status = Her2k(args.layout, args.triangle, args.a_transpose,
+                        args.n, args.k, alpha2,
+                        buffers.a_mat(), args.a_offset, args.a_ld,
+                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                        buffers.c_mat(), args.c_offset, args.c_ld,
+                        &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<U> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto alpha2 = T{args.alpha, args.alpha};
+    auto status = clblasXher2k(static_cast<clblasOrder>(args.layout),
+                               static_cast<clblasUplo>(args.triangle),
+                               static_cast<clblasTranspose>(args.a_transpose),
+                               args.n, args.k, alpha2,
+                               buffers.a_mat(), args.a_offset, args.a_ld,
+                               buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                               buffers.c_mat(), args.c_offset, args.c_ld,
+                               1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<U> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<U> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t id2) {
+    return id1*args.c_ld + id2 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<U> &args) {
+    return 2 * args.n * args.n * args.k;
+  }
+  static size_t GetBytes(const Arguments<U> &args) {
+    return (args.n*args.k + args.n*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XHER2K_H_
+#endif
diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h
new file mode 100644
index 00000000..780b9b52
--- /dev/null
+++ b/test/routines/level3/xherk.h
@@ -0,0 +1,121 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xherk routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XHERK_H_
+#define CLBLAST_TEST_ROUTINES_XHERK_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, typename U>
+class TestXherk {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgK,
+            kArgLayout, kArgTriangle, kArgATransp,
+            kArgALeadDim, kArgCLeadDim,
+            kArgAOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<U> &args) {
+    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto a_two = (a_rotated) ? args.n : args.k;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeC(const Arguments<U> &args) {
+    return args.n * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<U> &args) {
+    args.a_size = GetSizeA(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<U> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<U> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<U> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<U> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Herk(args.layout, args.triangle, args.a_transpose,
+                       args.n, args.k, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<U> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXherk(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasUplo>(args.triangle),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              args.n, args.k, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+                              buffers.c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<U> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<U> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t id2) {
+    return id1*args.c_ld + id2 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<U> &args) {
+    return args.n * args.n * args.k;
+  }
+  static size_t GetBytes(const Arguments<U> &args) {
+    return (args.n*args.k + args.n*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XHERK_H_
+#endif
diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h
new file mode 100644
index 00000000..10476349
--- /dev/null
+++ b/test/routines/level3/xsymm.h
@@ -0,0 +1,134 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsymm routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSYMM_H_
+#define CLBLAST_TEST_ROUTINES_XSYMM_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXsymm {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN,
+            kArgLayout, kArgSide, kArgTriangle,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    // A is the square k-by-k operand (k = m when applied from the left, n from the right), so
+    // its second dimension is k in both the row-major and the column-major layout.
+    size_t k_value = (args.side == Side::kLeft) ? args.m : args.n;
+    return k_value * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    // B is m-by-n regardless of the side: m rows of b_ld elements in the row-major layout, or
+    // n columns of b_ld elements in the column-major layout.
+    auto b_rotated = (args.layout == Layout::kRowMajor);
+    return ((b_rotated) ? args.m : args.n) * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    auto c_rotated = (args.layout == Layout::kRowMajor);
+    auto c_two = (c_rotated) ? args.m : args.n;
+    return c_two * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Symm(args.layout, args.side, args.triangle,
+                       args.m, args.n, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasSide>(args.side),
+                              static_cast<clblasUplo>(args.triangle),
+                              args.m, args.n, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                              buffers.c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (args.layout == Layout::kRowMajor) ?
+           id1*args.c_ld + id2 + args.c_offset:
+           id2*args.c_ld + id1 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.m * args.n * args.m;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSYMM_H_
+#endif
diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h
new file mode 100644
index 00000000..f3b1b542
--- /dev/null
+++ b/test/routines/level3/xsyr2k.h
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsyr2k routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSYR2K_H_
+#define CLBLAST_TEST_ROUTINES_XSYR2K_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXsyr2k {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgK,
+            kArgLayout, kArgTriangle, kArgATransp,
+            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
+            kArgAOffset, kArgBOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto a_two = (a_rotated) ? args.n : args.k;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    // B uses A's transpose flag: the routine takes one transpose argument (args.a_transpose)
+    auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    return ((b_rotated) ? args.n : args.k) * args.b_ld + args.b_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    return args.n * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
+                        args.n, args.k, args.alpha,
+                        buffers.a_mat(), args.a_offset, args.a_ld,
+                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                        buffers.c_mat(), args.c_offset, args.c_ld,
+                        &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
+                               static_cast<clblasUplo>(args.triangle),
+                               static_cast<clblasTranspose>(args.a_transpose),
+                               args.n, args.k, args.alpha,
+                               buffers.a_mat(), args.a_offset, args.a_ld,
+                               buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
+                               buffers.c_mat(), args.c_offset, args.c_ld,
+                               1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return id1*args.c_ld + id2 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n * args.k;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.n*args.k + args.n*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSYR2K_H_
+#endif
diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h
new file mode 100644
index 00000000..2ec9fb65
--- /dev/null
+++ b/test/routines/level3/xsyrk.h
@@ -0,0 +1,121 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xsyrk routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XSYRK_H_
+#define CLBLAST_TEST_ROUTINES_XSYRK_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXsyrk {
+ public:
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgK,
+            kArgLayout, kArgTriangle, kArgATransp,
+            kArgALeadDim, kArgCLeadDim,
+            kArgAOffset, kArgCOffset,
+            kArgAlpha, kArgBeta};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    auto a_two = (a_rotated) ? args.n : args.k;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeC(const Arguments<T> &args) {
+    return args.n * args.c_ld + args.c_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.c_size = GetSizeC(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Syrk(args.layout, args.triangle, args.a_transpose,
+                       args.n, args.k, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+                       buffers.c_mat(), args.c_offset, args.c_ld,
+                       &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasUplo>(args.triangle),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              args.n, args.k, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
+                              buffers.c_mat(), args.c_offset, args.c_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    clWaitForEvents(1, &event);
+    return static_cast<StatusCode>(status);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.c_size, static_cast<T>(0));
+    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return id1*args.c_ld + id2 + args.c_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return args.n * args.n * args.k;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.n*args.k + args.n*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XSYRK_H_
+#endif
diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h
new file mode 100644
index 00000000..7b7e7af1
--- /dev/null
+++ b/test/routines/level3/xtrmm.h
@@ -0,0 +1,127 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtrmm routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTRMM_H_
+#define CLBLAST_TEST_ROUTINES_XTRMM_H_
+
+#include <vector>
+#include <string>
+
+#include "wrapper_clblas.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtrmm {
+ public:
+
+  // Lists the names of the arguments that are relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN,
+            kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal,
+            kArgALeadDim, kArgBLeadDim,
+            kArgAOffset, kArgBOffset,
+            kArgAlpha};
+  }
+
+  // Computes the buffer sizes (in elements, offset included) from the dimensions and the layout
+  static size_t GetSizeA(const Arguments<T> &args) {
+    const auto k = (args.side == Side::kLeft) ? args.m : args.n; // size of A depends on side
+    return k * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    const auto b_rotated = (args.layout == Layout::kRowMajor);
+    const auto b_two = (b_rotated) ? args.m : args.n; // the dimension that is strided by 'b_ld'
+    return b_two * args.b_ld + args.b_offset;
+  }
+
+  // Stores the computed sizes of all the buffers of this routine in the arguments structure
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Runs the CLBlast routine and blocks until done; the event (if any) is released afterwards
+  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
+                               CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
+                       args.m, args.n, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.b_mat(), args.b_offset, args.b_ld,
+                       &queue_plain, &event);
+    if (event != nullptr) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Runs the clBLAS reference (for correctness/performance comparison), same event handling
+  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
+                                 CommandQueue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout),
+                              static_cast<clblasSide>(args.side),
+                              static_cast<clblasUplo>(args.triangle),
+                              static_cast<clblasTranspose>(args.a_transpose),
+                              static_cast<clblasDiag>(args.diagonal),
+                              args.m, args.n, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.b_mat(), args.b_offset, args.b_ld,
+                              1, &queue_plain, 0, nullptr, &event);
+    if (event != nullptr) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return static_cast<StatusCode>(status);
+  }
+
+  // Downloads the result of the computation: for this routine the output lives in the B buffer
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
+                                       CommandQueue &queue) {
+    std::vector<T> result(args.b_size, static_cast<T>(0));
+    buffers.b_mat.ReadBuffer(queue, args.b_size*sizeof(T), result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer (depends on the memory layout)
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (args.layout == Layout::kRowMajor) ?
+           id1*args.b_ld + id2 + args.b_offset:
+           id2*args.b_ld + id1 + args.b_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    const auto k = (args.side == Side::kLeft) ? args.m : args.n;
+    return args.m * args.n * k;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    const auto k = (args.side == Side::kLeft) ? args.m : args.n;
+    return (k*k + 2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTRMM_H_
+#endif
diff --git a/test/routines/xaxpy.h b/test/routines/xaxpy.h
deleted file mode 100644
index 6ce5d7e2..00000000
--- a/test/routines/xaxpy.h
+++ /dev/null
@@ -1,113 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a class with static methods to describe the Xaxpy routine. Examples of
-// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
-// static methods are used by the correctness tester and the performance tester.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_ROUTINES_XAXPY_H_
-#define CLBLAST_TEST_ROUTINES_XAXPY_H_
-
-#include <vector>
-#include <string>
-
-#include "wrapper_clblas.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestXaxpy {
- public:
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgN,
-            kArgXInc, kArgYInc,
-            kArgXOffset, kArgYOffset,
-            kArgAlpha};
-  }
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeX(const Arguments<T> &args) {
-    return args.n * args.x_inc + args.x_offset;
-  }
-  static size_t GetSizeY(const Arguments<T> &args) {
-    return args.n * args.y_inc + args.y_offset;
-  }
-
-  // Describes how to set the sizes of all the buffers
-  static void SetSizes(Arguments<T> &args) {
-    args.x_size = GetSizeX(args);
-    args.y_size = GetSizeY(args);
-  }
-
-  // Describes what the default values of the leading dimensions of the matrices are
-  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
-  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
-  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
-
-  // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
-                               CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Axpy(args.n, args.alpha,
-                       buffers.x_vec(), args.x_offset, args.x_inc,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    return status;
-  }
-
-  // Describes how to run the clBLAS routine (for correctness/performance comparison)
-  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
-                                 CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXaxpy(args.n, args.alpha,
-                              buffers.x_vec(), args.x_offset, args.x_inc,
-                              buffers.y_vec(), args.y_offset, args.y_inc,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    return static_cast<StatusCode>(status);
-  }
-
-  // Describes how to download the results of the computation (more importantly: which buffer)
-  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
-                                       CommandQueue &queue) {
-    std::vector<T> result(args.y_size, static_cast<T>(0));
-    buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result);
-    return result;
-  }
-
-  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
-  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
-  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
-    return id1*args.y_inc + args.y_offset;
-  }
-
-  // Describes how to compute performance metrics
-  static size_t GetFlops(const Arguments<T> &args) {
-    return 2 * args.n;
-  }
-  static size_t GetBytes(const Arguments<T> &args) {
-    return (3 * args.n) * sizeof(T);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_ROUTINES_XAXPY_H_
-#endif
diff --git a/test/routines/xgemm.h b/test/routines/xgemm.h
deleted file mode 100644
index 86a304d1..00000000
--- a/test/routines/xgemm.h
+++ /dev/null
@@ -1,134 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a class with static methods to describe the Xgemm routine. Examples of
-// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
-// static methods are used by the correctness tester and the performance tester.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_ROUTINES_XGEMM_H_
-#define CLBLAST_TEST_ROUTINES_XGEMM_H_
-
-#include <vector>
-#include <string>
-
-#include "wrapper_clblas.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestXgemm {
- public:
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgM, kArgN, kArgK,
-            kArgLayout, kArgATransp, kArgBTransp,
-            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-            kArgAOffset, kArgBOffset, kArgCOffset,
-            kArgAlpha, kArgBeta};
-  }
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeA(const Arguments<T> &args) {
-    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-    auto a_two = (a_rotated) ? args.m : args.k;
-    return a_two * args.a_ld + args.a_offset;
-  }
-  static size_t GetSizeB(const Arguments<T> &args) {
-    auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
-                     (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
-    auto b_two = (b_rotated) ? args.k : args.n;
-    return b_two * args.b_ld + args.b_offset;
-  }
-  static size_t GetSizeC(const Arguments<T> &args) {
-    auto c_rotated = (args.layout == Layout::kRowMajor);
-    auto c_two = (c_rotated) ? args.m : args.n;
-    return c_two * args.c_ld + args.c_offset;
-  }
-
-  // Describes how to set the sizes of all the buffers
-  static void SetSizes(Arguments<T> &args) {
-    args.a_size = GetSizeA(args);
-    args.b_size = GetSizeB(args);
-    args.c_size = GetSizeC(args);
-  }
-
-  // Describes what the default values of the leading dimensions of the matrices are
-  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
-  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
-  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
-
-  // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
-                               CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
-                       args.m, args.n, args.k, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                       buffers.c_mat(), args.c_offset, args.c_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    return status;
-  }
-
-  // Describes how to run the clBLAS routine (for correctness/performance comparison)
-  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
-                                 CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              static_cast<clblasTranspose>(args.b_transpose),
-                              args.m, args.n, args.k, args.alpha,
-                              buffers.a_mat(), args.a_offset, args.a_ld,
-                              buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                              buffers.c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    return static_cast<StatusCode>(status);
-  }
-
-  // Describes how to download the results of the computation (more importantly: which buffer)
-  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
-                                       CommandQueue &queue) {
-    std::vector<T> result(args.c_size, static_cast<T>(0));
-    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
-    return result;
-  }
-
-  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
-  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
-  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
-    return (args.layout == Layout::kRowMajor) ?
-           id1*args.c_ld + id2 + args.c_offset:
-           id2*args.c_ld + id1 + args.c_offset;
-  }
-
-  // Describes how to compute performance metrics
-  static size_t GetFlops(const Arguments<T> &args) {
-    return 2 * args.m * args.n * args.m;
-  }
-  static size_t GetBytes(const Arguments<T> &args) {
-    return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_ROUTINES_XGEMM_H_
-#endif
diff --git a/test/routines/xgemv.h b/test/routines/xgemv.h
deleted file mode 100644
index 73f7d76e..00000000
--- a/test/routines/xgemv.h
+++ /dev/null
@@ -1,132 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a class with static methods to describe the Xgemv routine. Examples of
-// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
-// static methods are used by the correctness tester and the performance tester.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_ROUTINES_XGEMV_H_
-#define CLBLAST_TEST_ROUTINES_XGEMV_H_
-
-#include <vector>
-#include <string>
-
-#include "wrapper_clblas.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestXgemv {
- public:
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgM, kArgN,
-            kArgLayout, kArgATransp, 
-            kArgALeadDim, kArgXInc, kArgYInc,
-            kArgAOffset, kArgXOffset, kArgYOffset,
-            kArgAlpha, kArgBeta};
-  }
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeA(const Arguments<T> &args) {
-    auto a_rotated = (args.layout == Layout::kRowMajor);
-    auto a_two = (a_rotated) ? args.m : args.n;
-    return a_two * args.a_ld + args.a_offset;
-  }
-  static size_t GetSizeX(const Arguments<T> &args) {
-    auto a_transposed = (args.a_transpose != Transpose::kNo);
-    auto n_real = (a_transposed) ? args.m : args.n;
-    return n_real * args.x_inc + args.x_offset;
-  }
-  static size_t GetSizeY(const Arguments<T> &args) {
-    auto a_transposed = (args.a_transpose != Transpose::kNo);
-    auto m_real = (a_transposed) ? args.n : args.m;
-    return m_real * args.y_inc + args.y_offset;
-  }
-
-  // Describes how to set the sizes of all the buffers
-  static void SetSizes(Arguments<T> &args) {
-    args.a_size = GetSizeA(args);
-    args.x_size = GetSizeX(args);
-    args.y_size = GetSizeY(args);
-  }
-
-  // Describes what the default values of the leading dimensions of the matrices are
-  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
-  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
-  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
-
-  // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
-                               CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Gemv(args.layout, args.a_transpose,
-                       args.m, args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                       buffers.y_vec(), args.y_offset, args.y_inc,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    return status;
-  }
-
-  // Describes how to run the clBLAS routine (for correctness/performance comparison)
-  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
-                                 CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              args.m, args.n, args.alpha,
-                              buffers.a_mat(), args.a_offset, args.a_ld,
-                              buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                              buffers.y_vec(), args.y_offset, args.y_inc,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    return static_cast<StatusCode>(status);
-  }
-
-  // Describes how to download the results of the computation (more importantly: which buffer)
-  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
-                                       CommandQueue &queue) {
-    std::vector<T> result(args.y_size, static_cast<T>(0));
-    buffers.y_vec.ReadBuffer(queue, args.y_size*sizeof(T), result);
-    return result;
-  }
-
-  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) {
-    auto a_transposed = (args.a_transpose != Transpose::kNo);
-    return (a_transposed) ? args.n : args.m;
-  }
-  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
-  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
-    return id1*args.y_inc + args.y_offset;
-  }
-
-  // Describes how to compute performance metrics
-  static size_t GetFlops(const Arguments<T> &args) {
-    return 2 * args.m * args.n;
-  }
-  static size_t GetBytes(const Arguments<T> &args) {
-    return (args.m*args.n + 2*args.m + args.n) * sizeof(T);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_ROUTINES_XGEMV_H_
-#endif
diff --git a/test/routines/xhemm.h b/test/routines/xhemm.h
deleted file mode 100644
index 75878b06..00000000
--- a/test/routines/xhemm.h
+++ /dev/null
@@ -1,134 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a class with static methods to describe the Xhemm routine. Examples of
-// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
-// static methods are used by the correctness tester and the performance tester.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_ROUTINES_XHEMM_H_
-#define CLBLAST_TEST_ROUTINES_XHEMM_H_
-
-#include <vector>
-#include <string>
-
-#include "wrapper_clblas.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestXhemm {
- public:
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgM, kArgN,
-            kArgLayout, kArgSide, kArgTriangle,
-            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-            kArgAOffset, kArgBOffset, kArgCOffset,
-            kArgAlpha, kArgBeta};
-  }
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeA(const Arguments<T> &args) {
-    size_t k_value = (args.side == Side::kLeft) ? args.m : args.n;
-    auto a_rotated = (args.layout == Layout::kRowMajor);
-    auto a_two = (a_rotated) ? args.m : k_value;
-    return a_two * args.a_ld + args.a_offset;
-  }
-  static size_t GetSizeB(const Arguments<T> &args) {
-    size_t k_value = (args.side == Side::kLeft) ? args.m : args.n;
-    auto b_rotated = (args.layout == Layout::kRowMajor);
-    auto b_two = (b_rotated) ? k_value : args.n;
-    return b_two * args.b_ld + args.b_offset;
-  }
-  static size_t GetSizeC(const Arguments<T> &args) {
-    auto c_rotated = (args.layout == Layout::kRowMajor);
-    auto c_two = (c_rotated) ? args.m : args.n;
-    return c_two * args.c_ld + args.c_offset;
-  }
-
-  // Describes how to set the sizes of all the buffers
-  static void SetSizes(Arguments<T> &args) {
-    args.a_size = GetSizeA(args);
-    args.b_size = GetSizeB(args);
-    args.c_size = GetSizeC(args);
-  }
-
-  // Describes what the default values of the leading dimensions of the matrices are
-  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
-  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
-  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
-
-  // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
-                               CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Hemm(args.layout, args.side, args.triangle,
-                       args.m, args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                       buffers.c_mat(), args.c_offset, args.c_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    return status;
-  }
-
-  // Describes how to run the clBLAS routine (for correctness/performance comparison)
-  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
-                                 CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXhemm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasSide>(args.side),
-                              static_cast<clblasUplo>(args.triangle),
-                              args.m, args.n, args.alpha,
-                              buffers.a_mat(), args.a_offset, args.a_ld,
-                              buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                              buffers.c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    return static_cast<StatusCode>(status);
-  }
-
-  // Describes how to download the results of the computation (more importantly: which buffer)
-  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
-                                       CommandQueue &queue) {
-    std::vector<T> result(args.c_size, static_cast<T>(0));
-    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
-    return result;
-  }
-
-  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
-  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
-  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
-    return (args.layout == Layout::kRowMajor) ?
-           id1*args.c_ld + id2 + args.c_offset:
-           id2*args.c_ld + id1 + args.c_offset;
-  }
-
-  // Describes how to compute performance metrics
-  static size_t GetFlops(const Arguments<T> &args) {
-    return 2 * args.m * args.n * args.m;
-  }
-  static size_t GetBytes(const Arguments<T> &args) {
-    return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_ROUTINES_XHEMM_H_
-#endif
diff --git a/test/routines/xher2k.h b/test/routines/xher2k.h
deleted file mode 100644
index f13e8a62..00000000
--- a/test/routines/xher2k.h
+++ /dev/null
@@ -1,132 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a class with static methods to describe the Xher2k routine. Examples of
-// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
-// static methods are used by the correctness tester and the performance tester.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_ROUTINES_XHER2K_H_
-#define CLBLAST_TEST_ROUTINES_XHER2K_H_
-
-#include <vector>
-#include <string>
-
-#include "wrapper_clblas.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T, typename U>
-class TestXher2k {
- public:
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgN, kArgK,
-            kArgLayout, kArgTriangle, kArgATransp,
-            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-            kArgAOffset, kArgBOffset, kArgCOffset,
-            kArgAlpha, kArgBeta};
-  }
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeA(const Arguments<U> &args) {
-    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-    auto a_two = (a_rotated) ? args.n : args.k;
-    return a_two * args.a_ld + args.a_offset;
-  }
-  static size_t GetSizeB(const Arguments<U> &args) {
-    auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
-                     (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
-    auto b_two = (b_rotated) ? args.n : args.k;
-    return b_two * args.b_ld + args.b_offset;
-  }
-  static size_t GetSizeC(const Arguments<U> &args) {
-    return args.n * args.c_ld + args.c_offset;
-  }
-
-  // Describes how to set the sizes of all the buffers
-  static void SetSizes(Arguments<U> &args) {
-    args.a_size = GetSizeA(args);
-    args.b_size = GetSizeB(args);
-    args.c_size = GetSizeC(args);
-  }
-
-  // Describes what the default values of the leading dimensions of the matrices are
-  static size_t DefaultLDA(const Arguments<U> &args) { return args.k; }
-  static size_t DefaultLDB(const Arguments<U> &args) { return args.k; }
-  static size_t DefaultLDC(const Arguments<U> &args) { return args.n; }
-
-  // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<U> &args, const Buffers &buffers,
-                               CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto alpha2 = T{args.alpha, args.alpha};
-    auto status = Her2k(args.layout, args.triangle, args.a_transpose,
-                        args.n, args.k, alpha2,
-                        buffers.a_mat(), args.a_offset, args.a_ld,
-                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                        buffers.c_mat(), args.c_offset, args.c_ld,
-                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    return status;
-  }
-
-  // Describes how to run the clBLAS routine (for correctness/performance comparison)
-  static StatusCode RunReference(const Arguments<U> &args, const Buffers &buffers,
-                                 CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto alpha2 = T{args.alpha, args.alpha};
-    auto status = clblasXher2k(static_cast<clblasOrder>(args.layout),
-                               static_cast<clblasUplo>(args.triangle),
-                               static_cast<clblasTranspose>(args.a_transpose),
-                               args.n, args.k, alpha2,
-                               buffers.a_mat(), args.a_offset, args.a_ld,
-                               buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                               buffers.c_mat(), args.c_offset, args.c_ld,
-                               1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    return static_cast<StatusCode>(status);
-  }
-
-  // Describes how to download the results of the computation (more importantly: which buffer)
-  static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers &buffers,
-                                       CommandQueue &queue) {
-    std::vector<T> result(args.c_size, static_cast<T>(0));
-    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
-    return result;
-  }
-
-  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<U> &args) { return args.n; }
-  static size_t ResultID2(const Arguments<U> &args) { return args.n; }
-  static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t id2) {
-    return id1*args.c_ld + id2 + args.c_offset;
-  }
-
-  // Describes how to compute performance metrics
-  static size_t GetFlops(const Arguments<U> &args) {
-    return 2 * args.n * args.n * args.k;
-  }
-  static size_t GetBytes(const Arguments<U> &args) {
-    return (args.n*args.k + args.n*args.n) * sizeof(T);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_ROUTINES_XHER2K_H_
-#endif
diff --git a/test/routines/xherk.h b/test/routines/xherk.h
deleted file mode 100644
index 780b9b52..00000000
--- a/test/routines/xherk.h
+++ /dev/null
@@ -1,121 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a class with static methods to describe the Xherk routine. Examples of
-// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
-// static methods are used by the correctness tester and the performance tester.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_ROUTINES_XHERK_H_
-#define CLBLAST_TEST_ROUTINES_XHERK_H_
-
-#include <vector>
-#include <string>
-
-#include "wrapper_clblas.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T, typename U>
-class TestXherk {
- public:
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgN, kArgK,
-            kArgLayout, kArgTriangle, kArgATransp,
-            kArgALeadDim, kArgCLeadDim,
-            kArgAOffset, kArgCOffset,
-            kArgAlpha, kArgBeta};
-  }
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeA(const Arguments<U> &args) {
-    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-    auto a_two = (a_rotated) ? args.n : args.k;
-    return a_two * args.a_ld + args.a_offset;
-  }
-  static size_t GetSizeC(const Arguments<U> &args) {
-    return args.n * args.c_ld + args.c_offset;
-  }
-
-  // Describes how to set the sizes of all the buffers
-  static void SetSizes(Arguments<U> &args) {
-    args.a_size = GetSizeA(args);
-    args.c_size = GetSizeC(args);
-  }
-
-  // Describes what the default values of the leading dimensions of the matrices are
-  static size_t DefaultLDA(const Arguments<U> &args) { return args.k; }
-  static size_t DefaultLDB(const Arguments<U> &) { return 1; } // N/A for this routine
-  static size_t DefaultLDC(const Arguments<U> &args) { return args.n; }
-
-  // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<U> &args, const Buffers &buffers,
-                               CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Herk(args.layout, args.triangle, args.a_transpose,
-                       args.n, args.k, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
-                       buffers.c_mat(), args.c_offset, args.c_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    return status;
-  }
-
-  // Describes how to run the clBLAS routine (for correctness/performance comparison)
-  static StatusCode RunReference(const Arguments<U> &args, const Buffers &buffers,
-                                 CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXherk(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasUplo>(args.triangle),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              args.n, args.k, args.alpha,
-                              buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
-                              buffers.c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    return static_cast<StatusCode>(status);
-  }
-
-  // Describes how to download the results of the computation (more importantly: which buffer)
-  static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers &buffers,
-                                       CommandQueue &queue) {
-    std::vector<T> result(args.c_size, static_cast<T>(0));
-    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
-    return result;
-  }
-
-  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<U> &args) { return args.n; }
-  static size_t ResultID2(const Arguments<U> &args) { return args.n; }
-  static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t id2) {
-    return id1*args.c_ld + id2 + args.c_offset;
-  }
-
-  // Describes how to compute performance metrics
-  static size_t GetFlops(const Arguments<U> &args) {
-    return args.n * args.n * args.k;
-  }
-  static size_t GetBytes(const Arguments<U> &args) {
-    return (args.n*args.k + args.n*args.n) * sizeof(T);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_ROUTINES_XHERK_H_
-#endif
diff --git a/test/routines/xsymm.h b/test/routines/xsymm.h
deleted file mode 100644
index 10476349..00000000
--- a/test/routines/xsymm.h
+++ /dev/null
@@ -1,134 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a class with static methods to describe the Xsymm routine. Examples of
-// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
-// static methods are used by the correctness tester and the performance tester.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_ROUTINES_XSYMM_H_
-#define CLBLAST_TEST_ROUTINES_XSYMM_H_
-
-#include <vector>
-#include <string>
-
-#include "wrapper_clblas.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestXsymm {
- public:
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgM, kArgN,
-            kArgLayout, kArgSide, kArgTriangle,
-            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-            kArgAOffset, kArgBOffset, kArgCOffset,
-            kArgAlpha, kArgBeta};
-  }
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeA(const Arguments<T> &args) {
-    size_t k_value = (args.side == Side::kLeft) ? args.m : args.n;
-    auto a_rotated = (args.layout == Layout::kRowMajor);
-    auto a_two = (a_rotated) ? args.m : k_value;
-    return a_two * args.a_ld + args.a_offset;
-  }
-  static size_t GetSizeB(const Arguments<T> &args) {
-    size_t k_value = (args.side == Side::kLeft) ? args.m : args.n;
-    auto b_rotated = (args.layout == Layout::kRowMajor);
-    auto b_two = (b_rotated) ? k_value : args.n;
-    return b_two * args.b_ld + args.b_offset;
-  }
-  static size_t GetSizeC(const Arguments<T> &args) {
-    auto c_rotated = (args.layout == Layout::kRowMajor);
-    auto c_two = (c_rotated) ? args.m : args.n;
-    return c_two * args.c_ld + args.c_offset;
-  }
-
-  // Describes how to set the sizes of all the buffers
-  static void SetSizes(Arguments<T> &args) {
-    args.a_size = GetSizeA(args);
-    args.b_size = GetSizeB(args);
-    args.c_size = GetSizeC(args);
-  }
-
-  // Describes what the default values of the leading dimensions of the matrices are
-  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
-  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
-  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
-
-  // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
-                               CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Symm(args.layout, args.side, args.triangle,
-                       args.m, args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                       buffers.c_mat(), args.c_offset, args.c_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    return status;
-  }
-
-  // Describes how to run the clBLAS routine (for correctness/performance comparison)
-  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
-                                 CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasSide>(args.side),
-                              static_cast<clblasUplo>(args.triangle),
-                              args.m, args.n, args.alpha,
-                              buffers.a_mat(), args.a_offset, args.a_ld,
-                              buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                              buffers.c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    return static_cast<StatusCode>(status);
-  }
-
-  // Describes how to download the results of the computation (more importantly: which buffer)
-  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
-                                       CommandQueue &queue) {
-    std::vector<T> result(args.c_size, static_cast<T>(0));
-    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
-    return result;
-  }
-
-  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
-  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
-  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
-    return (args.layout == Layout::kRowMajor) ?
-           id1*args.c_ld + id2 + args.c_offset:
-           id2*args.c_ld + id1 + args.c_offset;
-  }
-
-  // Describes how to compute performance metrics
-  static size_t GetFlops(const Arguments<T> &args) {
-    return 2 * args.m * args.n * args.m;
-  }
-  static size_t GetBytes(const Arguments<T> &args) {
-    return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_ROUTINES_XSYMM_H_
-#endif
diff --git a/test/routines/xsyr2k.h b/test/routines/xsyr2k.h
deleted file mode 100644
index f3b1b542..00000000
--- a/test/routines/xsyr2k.h
+++ /dev/null
@@ -1,130 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a class with static methods to describe the Xsyr2k routine. Examples of
-// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
-// static methods are used by the correctness tester and the performance tester.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_ROUTINES_XSYR2K_H_
-#define CLBLAST_TEST_ROUTINES_XSYR2K_H_
-
-#include <vector>
-#include <string>
-
-#include "wrapper_clblas.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestXsyr2k {
- public:
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgN, kArgK,
-            kArgLayout, kArgTriangle, kArgATransp,
-            kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
-            kArgAOffset, kArgBOffset, kArgCOffset,
-            kArgAlpha, kArgBeta};
-  }
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeA(const Arguments<T> &args) {
-    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-    auto a_two = (a_rotated) ? args.n : args.k;
-    return a_two * args.a_ld + args.a_offset;
-  }
-  static size_t GetSizeB(const Arguments<T> &args) {
-    auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
-                     (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
-    auto b_two = (b_rotated) ? args.n : args.k;
-    return b_two * args.b_ld + args.b_offset;
-  }
-  static size_t GetSizeC(const Arguments<T> &args) {
-    return args.n * args.c_ld + args.c_offset;
-  }
-
-  // Describes how to set the sizes of all the buffers
-  static void SetSizes(Arguments<T> &args) {
-    args.a_size = GetSizeA(args);
-    args.b_size = GetSizeB(args);
-    args.c_size = GetSizeC(args);
-  }
-
-  // Describes what the default values of the leading dimensions of the matrices are
-  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
-  static size_t DefaultLDB(const Arguments<T> &args) { return args.k; }
-  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
-
-  // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
-                               CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
-                        args.n, args.k, args.alpha,
-                        buffers.a_mat(), args.a_offset, args.a_ld,
-                        buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                        buffers.c_mat(), args.c_offset, args.c_ld,
-                        &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    return status;
-  }
-
-  // Describes how to run the clBLAS routine (for correctness/performance comparison)
-  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
-                                 CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
-                               static_cast<clblasUplo>(args.triangle),
-                               static_cast<clblasTranspose>(args.a_transpose),
-                               args.n, args.k, args.alpha,
-                               buffers.a_mat(), args.a_offset, args.a_ld,
-                               buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                               buffers.c_mat(), args.c_offset, args.c_ld,
-                               1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    return static_cast<StatusCode>(status);
-  }
-
-  // Describes how to download the results of the computation (more importantly: which buffer)
-  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
-                                       CommandQueue &queue) {
-    std::vector<T> result(args.c_size, static_cast<T>(0));
-    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
-    return result;
-  }
-
-  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
-  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
-  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
-    return id1*args.c_ld + id2 + args.c_offset;
-  }
-
-  // Describes how to compute performance metrics
-  static size_t GetFlops(const Arguments<T> &args) {
-    return 2 * args.n * args.n * args.k;
-  }
-  static size_t GetBytes(const Arguments<T> &args) {
-    return (args.n*args.k + args.n*args.n) * sizeof(T);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_ROUTINES_XSYR2K_H_
-#endif
diff --git a/test/routines/xsyrk.h b/test/routines/xsyrk.h
deleted file mode 100644
index 2ec9fb65..00000000
--- a/test/routines/xsyrk.h
+++ /dev/null
@@ -1,121 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a class with static methods to describe the Xsyrk routine. Examples of
-// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
-// static methods are used by the correctness tester and the performance tester.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_ROUTINES_XSYRK_H_
-#define CLBLAST_TEST_ROUTINES_XSYRK_H_
-
-#include <vector>
-#include <string>
-
-#include "wrapper_clblas.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestXsyrk {
- public:
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgN, kArgK,
-            kArgLayout, kArgTriangle, kArgATransp,
-            kArgALeadDim, kArgCLeadDim,
-            kArgAOffset, kArgCOffset,
-            kArgAlpha, kArgBeta};
-  }
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeA(const Arguments<T> &args) {
-    auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
-                     (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
-    auto a_two = (a_rotated) ? args.n : args.k;
-    return a_two * args.a_ld + args.a_offset;
-  }
-  static size_t GetSizeC(const Arguments<T> &args) {
-    return args.n * args.c_ld + args.c_offset;
-  }
-
-  // Describes how to set the sizes of all the buffers
-  static void SetSizes(Arguments<T> &args) {
-    args.a_size = GetSizeA(args);
-    args.c_size = GetSizeC(args);
-  }
-
-  // Describes what the default values of the leading dimensions of the matrices are
-  static size_t DefaultLDA(const Arguments<T> &args) { return args.k; }
-  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
-  static size_t DefaultLDC(const Arguments<T> &args) { return args.n; }
-
-  // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
-                               CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Syrk(args.layout, args.triangle, args.a_transpose,
-                       args.n, args.k, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
-                       buffers.c_mat(), args.c_offset, args.c_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    return status;
-  }
-
-  // Describes how to run the clBLAS routine (for correctness/performance comparison)
-  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
-                                 CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasUplo>(args.triangle),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              args.n, args.k, args.alpha,
-                              buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
-                              buffers.c_mat(), args.c_offset, args.c_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    return static_cast<StatusCode>(status);
-  }
-
-  // Describes how to download the results of the computation (more importantly: which buffer)
-  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
-                                       CommandQueue &queue) {
-    std::vector<T> result(args.c_size, static_cast<T>(0));
-    buffers.c_mat.ReadBuffer(queue, args.c_size*sizeof(T), result);
-    return result;
-  }
-
-  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.n; }
-  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
-  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
-    return id1*args.c_ld + id2 + args.c_offset;
-  }
-
-  // Describes how to compute performance metrics
-  static size_t GetFlops(const Arguments<T> &args) {
-    return args.n * args.n * args.k;
-  }
-  static size_t GetBytes(const Arguments<T> &args) {
-    return (args.n*args.k + args.n*args.n) * sizeof(T);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_ROUTINES_XSYRK_H_
-#endif
diff --git a/test/routines/xtrmm.h b/test/routines/xtrmm.h
deleted file mode 100644
index 7b7e7af1..00000000
--- a/test/routines/xtrmm.h
+++ /dev/null
@@ -1,127 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements a class with static methods to describe the Xtrmm routine. Examples of
-// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
-// static methods are used by the correctness tester and the performance tester.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_TEST_ROUTINES_XTRMM_H_
-#define CLBLAST_TEST_ROUTINES_XTRMM_H_
-
-#include <vector>
-#include <string>
-
-#include "wrapper_clblas.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class TestXtrmm {
- public:
-
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgM, kArgN,
-            kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal,
-            kArgALeadDim, kArgBLeadDim,
-            kArgAOffset, kArgBOffset,
-            kArgAlpha};
-  }
-
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeA(const Arguments<T> &args) {
-    auto k = (args.side == Side::kLeft) ? args.m : args.n;
-    return k * args.a_ld + args.a_offset;
-  }
-  static size_t GetSizeB(const Arguments<T> &args) {
-    auto b_rotated = (args.layout == Layout::kRowMajor);
-    auto b_two = (b_rotated) ? args.m : args.n;
-    return b_two * args.b_ld + args.b_offset;
-  }
-
-  // Describes how to set the sizes of all the buffers
-  static void SetSizes(Arguments<T> &args) {
-    args.a_size = GetSizeA(args);
-    args.b_size = GetSizeB(args);
-  }
-
-  // Describes what the default values of the leading dimensions of the matrices are
-  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
-  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
-  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
-
-  // Describes how to run the CLBlast routine
-  static StatusCode RunRoutine(const Arguments<T> &args, const Buffers &buffers,
-                               CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
-                       args.m, args.n, args.alpha,
-                       buffers.a_mat(), args.a_offset, args.a_ld,
-                       buffers.b_mat(), args.b_offset, args.b_ld,
-                       &queue_plain, &event);
-    clWaitForEvents(1, &event);
-    return status;
-  }
-
-  // Describes how to run the clBLAS routine (for correctness/performance comparison)
-  static StatusCode RunReference(const Arguments<T> &args, const Buffers &buffers,
-                                 CommandQueue &queue) {
-    auto queue_plain = queue();
-    auto event = cl_event{};
-    auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout),
-                              static_cast<clblasSide>(args.side),
-                              static_cast<clblasUplo>(args.triangle),
-                              static_cast<clblasTranspose>(args.a_transpose),
-                              static_cast<clblasDiag>(args.diagonal),
-                              args.m, args.n, args.alpha,
-                              buffers.a_mat(), args.a_offset, args.a_ld,
-                              buffers.b_mat(), args.b_offset, args.b_ld,
-                              1, &queue_plain, 0, nullptr, &event);
-    clWaitForEvents(1, &event);
-    return static_cast<StatusCode>(status);
-  }
-
-  // Describes how to download the results of the computation (more importantly: which buffer)
-  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers &buffers,
-                                       CommandQueue &queue) {
-    std::vector<T> result(args.b_size, static_cast<T>(0));
-    buffers.b_mat.ReadBuffer(queue, args.b_size*sizeof(T), result);
-    return result;
-  }
-
-  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
-  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
-  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
-    return (args.layout == Layout::kRowMajor) ?
-           id1*args.b_ld + id2 + args.b_offset:
-           id2*args.b_ld + id1 + args.b_offset;
-  }
-
-  // Describes how to compute performance metrics
-  static size_t GetFlops(const Arguments<T> &args) {
-    auto k = (args.side == Side::kLeft) ? args.m : args.n;
-    return args.m * args.n * k;
-  }
-  static size_t GetBytes(const Arguments<T> &args) {
-    auto k = (args.side == Side::kLeft) ? args.m : args.n;
-    return (k*k + 2*args.m*args.n) * sizeof(T);
-  }
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_TEST_ROUTINES_XTRMM_H_
-#endif
-- 
cgit v1.2.3